comparison plotWheels/core.py @ 1:9b276485c94a draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:44:43 +0000
parents
children
comparison
equal deleted inserted replaced
0:9caa9aa44fd8 1:9b276485c94a
1 # -*- coding: utf-8 -*-
2 """
3 .. currentmodule:: modlamp.core
4
5 .. moduleauthor:: modlab Alex Mueller ETH Zurich <alex.mueller@pharma.ethz.ch>
6
7 Core helper functions and classes for other modules. The two main classes are:
8
9 ============================= =======================================================================================
10 Class Characteristics
11 ============================= =======================================================================================
12 :py:class:`BaseSequence` Base class inheriting to all sequence classes in the module :py:mod:`modlamp.sequences`
13 :py:class:`BaseDescriptor` Base class inheriting to the two descriptor classes in :py:mod:`modlamp.descriptors`
14 ============================= =======================================================================================
15 """
16
17 import os
18 import random
19 import re
20
21 import numpy as np
22 import pandas as pd
23 import collections
24 import operator
25 from scipy.spatial import distance
26 from sklearn.preprocessing import MinMaxScaler, StandardScaler
27 from sklearn.utils import shuffle
28
29 __author__ = "Alex Müller, Gisela Gabernet"
30 __docformat__ = "restructuredtext en"
31
32
class BaseSequence(object):
    """Base class for sequence classes in the module :mod:`modlamp.sequences`.

    It contains amino acid probabilities for different sequence generation classes.

    The following amino acid probabilities are used: (extracted from the
    `APD3 <http://aps.unmc.edu/AP/statistic/statistic.php>`_, March 17, 2016)

    === ==== ====== ========= ==========
    AA  rand AMP    AMPnoCM   randnoCM
    === ==== ====== ========= ==========
    A   0.05 0.0766 0.0812275 0.05555555
    C   0.05 0.071  0.0       0.0
    D   0.05 0.026  0.0306275 0.05555555
    E   0.05 0.0264 0.0310275 0.05555555
    F   0.05 0.0405 0.0451275 0.05555555
    G   0.05 0.1172 0.1218275 0.05555555
    H   0.05 0.021  0.0256275 0.05555555
    I   0.05 0.061  0.0656275 0.05555555
    K   0.05 0.0958 0.1004275 0.05555555
    L   0.05 0.0838 0.0884275 0.05555555
    M   0.05 0.0123 0.0       0.0
    N   0.05 0.0386 0.0432275 0.05555555
    P   0.05 0.0463 0.0509275 0.05555555
    Q   0.05 0.0251 0.0297275 0.05555555
    R   0.05 0.0545 0.0591275 0.05555555
    S   0.05 0.0613 0.0659275 0.05555555
    T   0.05 0.0455 0.0501275 0.05555555
    V   0.05 0.0572 0.0618275 0.05555555
    W   0.05 0.0155 0.0201275 0.05555555
    Y   0.05 0.0244 0.0290275 0.05555555
    === ==== ====== ========= ==========

    """

    def __init__(self, seqnum, lenmin=7, lenmax=28):
        """
        :param seqnum: number of sequences to generate
        :param lenmin: minimal length of the generated sequences
        :param lenmax: maximal length of the generated sequences
        :return: attributes :py:attr:`seqnum`, :py:attr:`lenmin` and :py:attr:`lenmax`.
        :Example:

        >>> b = BaseSequence(10, 7, 28)
        >>> b.seqnum
        10
        >>> b.lenmin
        7
        >>> b.lenmax
        28
        """
        self.sequences = []
        self.names = []
        self.lenmin = int(lenmin)
        self.lenmax = int(lenmax)
        self.seqnum = int(seqnum)

        # AA classes:
        self.AA_hyd = ["G", "A", "L", "I", "V"]
        self.AA_basic = ["K", "R"]
        self.AA_acidic = ["D", "E"]
        self.AA_aroma = ["W", "Y", "F"]
        self.AA_polar = ["S", "T", "Q", "N"]
        # AA labels; every 20-element probability list below follows this order:
        self.AAs = list("ACDEFGHIKLMNPQRSTVWY")
        # AA probability from the APD3 database:
        self.prob_AMP = [
            0.0766, 0.071, 0.026, 0.0264, 0.0405,
            0.1172, 0.021, 0.061, 0.0958, 0.0838,
            0.0123, 0.0386, 0.0463, 0.0251, 0.0545,
            0.0613, 0.0455, 0.0572, 0.0155, 0.0244,
        ]
        # AA probability from the APD2 database without Cys and Met (synthesis reasons)
        self.prob_AMPnoCM = [
            0.081228, 0.0, 0.030627, 0.031027, 0.045128,
            0.121828, 0.025627, 0.065628, 0.100428, 0.088428,
            0.0, 0.043228, 0.050928, 0.029728, 0.059128,
            0.065927, 0.050128, 0.061828, 0.020128, 0.029028,
        ]
        # equal AA probabilities:
        self.prob = [0.05] * 20
        # equal AA probabilities but 0 for Cys and Met:
        self.prob_randnoCM = [
            0.05555555555, 0.0, 0.05555555555, 0.05555555555, 0.05555555555,
            0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555,
            0.0, 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555,
            0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555,
        ]

        # AA probability from the linear CancerPPD peptides:
        self.prob_ACP = [
            0.14526966, 0.0, 0.00690031, 0.00780824, 0.06991102,
            0.04957327, 0.01725077, 0.05647358, 0.27637552, 0.17759216,
            0.00998729, 0.00798983, 0.01307427, 0.00381333, 0.02941711,
            0.02651171, 0.0154349, 0.04013074, 0.0406755, 0.00581079,
        ]

        # AA probabilities for perfect amphipathic helix of different arc sizes
        self.prob_amphihel = [
            [
                0.04545455, 0.0, 0.04545454, 0.04545455, 0.0,
                0.04545455, 0.04545455, 0.0, 0.25, 0.0,
                0.0, 0.04545454, 0.04545455, 0.04545454, 0.25,
                0.04545454, 0.04545454, 0.0, 0.0, 0.04545454,
            ],
            [
                0.0, 0.0, 0.0, 0.0, 0.16666667,
                0.0, 0.0, 0.16666667, 0.0, 0.16666667,
                0.0, 0.0, 0.0, 0.0, 0.0,
                0.0, 0.0, 0.16666667, 0.16666667, (1.0 - 0.16666667 * 5),
            ],
        ]

        # helical ACP AA probabilities, depending on the position of the AA in the helix.
        # 20 rows of 18 values each.
        self.prob_ACPhel = np.array(
            [
                [
                    0.0483871, 0.0, 0.0, 0.0483871, 0.01612903, 0.12903226,
                    0.03225807, 0.09677419, 0.19354839, 0.5, 0.0483871, 0.11290323,
                    0.1, 0.18518519, 0.07843137, 0.12, 0.17073172, 0.16666667,
                ],
                [
                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                    0.0, 0.0, 0.0, 0.0, 0.01612903, 0.0,
                    0.0, 0.0, 0.0, 0.0, 0.02439024, 0.19444444,
                ],
                [
                    0.0, 0.01612903, 0.0, 0.27419355, 0.01612903, 0.0,
                    0.0, 0.01612903, 0.0, 0.0, 0.0, 0.0,
                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                ],
                [
                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                    0.0, 0.06451613, 0.0, 0.01612903, 0.0483871, 0.01612903,
                    0.0, 0.01851852, 0.0, 0.0, 0.0, 0.0,
                ],
                [
                    0.16129032, 0.0483871, 0.30645161, 0.0, 0.0483871, 0.0,
                    0.0, 0.01612903, 0.0, 0.01612903, 0.0, 0.09677419,
                    0.06666667, 0.01851852, 0.0, 0.02, 0.14634146, 0.0,
                ],
                [
                    0.64516129, 0.0, 0.17741936, 0.14516129, 0.0, 0.01612903,
                    0.25806452, 0.11290323, 0.06451613, 0.08064516, 0.22580645, 0.03225807,
                    0.06666667, 0.2037037, 0.1372549, 0.1, 0.0, 0.05555556,
                ],
                [
                    0.0, 0.0, 0.0, 0.01612903, 0.0, 0.0,
                    0.01612903, 0.0, 0.03225807, 0.0, 0.0, 0.20967742,
                    0.0, 0.0, 0.0, 0.16, 0.0, 0.0,
                ],
                [
                    0.0483871, 0.11290323, 0.01612903, 0.08064516, 0.33870968, 0.27419355,
                    0.0, 0.0483871, 0.14516129, 0.06451613, 0.03225807, 0.06451613,
                    0.18333333, 0.0, 0.0, 0.1, 0.26829268, 0.0,
                ],
                [
                    0.0, 0.03225807, 0.01612903, 0.12903226, 0.12903226, 0.0,
                    0.38709677, 0.33870968, 0.0483871, 0.03225807, 0.41935484, 0.08064516,
                    0.0, 0.03703704, 0.29411765, 0.04, 0.02439024, 0.02777778,
                ],
                [
                    0.0483871, 0.70967742, 0.12903226, 0.0483871, 0.09677419, 0.32258064,
                    0.20967742, 0.06451613, 0.11290323, 0.06451613, 0.03225807, 0.03225807,
                    0.28333333, 0.24074074, 0.03921569, 0.28, 0.07317073, 0.22222222,
                ],
                [
                    0.0, 0.01612903, 0.01612903, 0.0483871, 0.01612903, 0.03225807,
                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                    0.03333333, 0.0, 0.01960784, 0.02, 0.0, 0.0,
                ],
                [
                    0.0, 0.01612903, 0.0, 0.0, 0.0, 0.0,
                    0.0, 0.0, 0.01612903, 0.0, 0.03225807, 0.0,
                    0.0, 0.0, 0.01960784, 0.02, 0.0, 0.0,
                ],
                [
                    0.0, 0.0, 0.14516129, 0.01612903, 0.03225807, 0.01612903,
                    0.0, 0.0, 0.0, 0.0, 0.01612903, 0.0,
                    0.0, 0.12962963, 0.17647059, 0.0, 0.0, 0.0,
                ],
                [
                    0.0, 0.0, 0.01612903, 0.01612903, 0.0, 0.0,
                    0.01612903, 0.0, 0.01612903, 0.0, 0.0, 0.01612903,
                    0.0, 0.01851852, 0.0, 0.0, 0.0, 0.0,
                ],
                [
                    0.0, 0.01612903, 0.01612903, 0.0, 0.01612903, 0.0,
                    0.01612903, 0.0, 0.01612903, 0.01612903, 0.01612903, 0.01612903,
                    0.0, 0.01851852, 0.01960784, 0.0, 0.04878049, 0.0,
                ],
                [
                    0.01612903, 0.0, 0.01612903, 0.12903226, 0.03225807, 0.03225807,
                    0.0483871, 0.17741936, 0.0, 0.03225807, 0.09677419, 0.0483871,
                    0.01666667, 0.0, 0.15686274, 0.1, 0.0, 0.05555556,
                ],
                [
                    0.01612903, 0.01612903, 0.0, 0.01612903, 0.0483871, 0.01612903,
                    0.0, 0.01612903, 0.0, 0.01612903, 0.01612903, 0.11290323,
                    0.0, 0.01851852, 0.03921569, 0.02, 0.0, 0.05555556,
                ],
                [
                    0.01612903, 0.01612903, 0.01612903, 0.01612903, 0.20967742, 0.16129032,
                    0.01612903, 0.0483871, 0.33870968, 0.16129032, 0.0, 0.14516129,
                    0.25, 0.11111111, 0.01960784, 0.02, 0.21951219, 0.22222222,
                ],
                [
                    0.0, 0.0, 0.12903226, 0.01612903, 0.0, 0.0,
                    0.0, 0.0, 0.01612903, 0.0, 0.0, 0.0,
                    0.0, 0.0, 0.0, 0.0, 0.02439024, 0.0,
                ],
                [
                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                    0.0, 0.0, 0.0, 0.0, 0.0, 0.01612903,
                    0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                ],
            ]
        )

    def save_fasta(self, filename, names=False):
        """Method to save generated sequences in a ``.FASTA`` formatted file.

        Delegates to the module-level function ``save_fasta``.

        :param filename: output filename in which the sequences from :py:attr:`sequences` are saved in fasta format.
        :param names: {bool} whether sequence names from :py:attr:`names` should be saved as sequence identifiers
        :return: a FASTA formatted file containing the generated sequences
        :Example:

        >>> b = BaseSequence(2)
        >>> b.sequences = ['KLLSLSLALDLLS', 'KLPERTVVNSSDF']
        >>> b.names = ['Sequence1', 'Sequence2']
        >>> b.save_fasta('/location/of/fasta/file.fasta', names=True)
        """
        if names:
            save_fasta(filename, self.sequences, self.names)
        else:
            save_fasta(filename, self.sequences)

    def mutate_AA(self, nr, prob):
        """Method to mutate with **prob** probability a **nr** of positions per sequence randomly.

        Positions are drawn with replacement, so the same position can be hit more than once and
        a drawn residue can equal the one it replaces. Empty sequences are left untouched.

        :param nr: number of mutations to perform per sequence
        :param prob: probability of mutating a sequence
        :return: mutated sequences in the attribute :py:attr:`sequences`.
        :Example:

        >>> b = BaseSequence(1)
        >>> b.sequences = ['IAKAGRAIIK']
        >>> b.mutate_AA(3, 1.)
        >>> b.sequences
        ['NAKAGRAWIK']
        """
        for s, sequence in enumerate(self.sequences):
            # mutate: yes or no? prob = mutation probability
            mutate = np.random.choice([1, 0], 1, p=[prob, 1 - float(prob)])
            if mutate[0] != 1:
                continue
            seq = list(sequence)
            if not seq:
                # nothing to mutate; random.choice() would raise IndexError on an empty range
                continue
            cnt = 0
            while cnt < nr:  # mutate "nr" AA
                seq[random.choice(range(len(seq)))] = random.choice(self.AAs)
                cnt += 1
            self.sequences[s] = "".join(seq)

    def filter_duplicates(self):
        """Method to filter duplicates in the sequences from the class attribute :py:attr:`sequences`

        The first occurrence of every sequence is kept. If :py:attr:`names` is empty, names of the
        form ``Seq_<index>`` are generated before filtering.

        :return: filtered sequences list in the attribute :py:attr:`sequences` and corresponding names.
        :Example:

        >>> b = BaseSequence(4)
        >>> b.sequences = ['KLLKLLKKLLKLLK', 'KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK', 'KLAKLAKKLAKLAK']
        >>> b.filter_duplicates()
        >>> b.sequences
        ['KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK']

        .. versionadded:: v2.2.5
        """
        if not self.names:
            self.names = ["Seq_" + str(i) for i in range(len(self.sequences))]

        # Plain-Python de-duplication: avoids ``Series.get_values()`` and the positional ``keep``
        # argument of ``DataFrame.drop_duplicates``, both of which were removed from pandas.
        seen = set()
        seqs = []
        names = []
        for seq, name in zip(self.sequences, self.names):
            if seq in seen:
                continue
            seen.add(seq)
            seqs.append(seq)
            names.append(name)

        self.sequences = seqs
        self.names = names

    def keep_natural_aa(self):
        """Method to filter out sequences that do not contain natural amino acids. If the sequence contains a character
        that is not in ``['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']``.

        Sequences are compared in upper case and the kept sequences are stored upper-cased.

        :return: filtered sequence list in the attribute :py:attr:`sequences`. The other attributes are also filtered
            accordingly (if present).
        :Example:

        >>> b = BaseSequence(2)
        >>> b.sequences = ['BBBsdflUasUJfBJ', 'GLFDIVKKVVGALGSL']
        >>> b.keep_natural_aa()
        >>> b.sequences
        ['GLFDIVKKVVGALGSL']
        """
        natural_aa = frozenset("ACDEFGHIKLMNPQRSTVWY")

        seqs = []
        names = []

        for i, s in enumerate(self.sequences):
            upper = s.upper()
            if all(c in natural_aa for c in upper):
                seqs.append(upper)
                if hasattr(self, "names") and self.names:
                    names.append(self.names[i])

        self.sequences = seqs
        self.names = names

    def filter_aa(self, amino_acids):
        """Method to filter out corresponding names and descriptor values of sequences with given amino acids in the
        argument list *amino_acids*.

        The entries of *amino_acids* are joined with ``|`` and used as a regular expression. An
        empty list leaves the instance unchanged.

        :param amino_acids: {list} amino acids to be filtered
        :return: filtered list of sequences names in the corresponding attributes.
        :Example:

        >>> b = BaseSequence(3)
        >>> b.sequences = ['AAALLLIIIKKK', 'CCEERRT', 'LLVVIIFFFQQ']
        >>> b.filter_aa(['C'])
        >>> b.sequences
        ['AAALLLIIIKKK', 'LLVVIIFFFQQ']
        """
        if not amino_acids:
            # An empty pattern matches every string and would discard all sequences.
            return

        pattern = re.compile("|".join(amino_acids))
        seqs = []
        names = []

        for i, s in enumerate(self.sequences):
            if not pattern.search(s):
                seqs.append(s)
                if hasattr(self, "names") and self.names:
                    names.append(self.names[i])

        self.sequences = seqs
        self.names = names

    def clean(self):
        """Method to clean / clear / empty the attributes :py:attr:`sequences` and :py:attr:`names`.

        Re-runs ``__init__`` with the current :py:attr:`seqnum`, :py:attr:`lenmin` and
        :py:attr:`lenmax`, which also resets all probability tables.

        :return: freshly initialized, empty class attributes.
        """
        self.__init__(self.seqnum, self.lenmin, self.lenmax)
841
842
843 class BaseDescriptor(object):
844 """
845 Base class inheriting to both peptide descriptor classes :py:class:`modlamp.descriptors.GlobalDescriptor` and
846 :py:class:`modlamp.descriptors.PeptideDescriptor`.
847 """
848
def __init__(self, seqs):
    """
    :param seqs: a ``.FASTA`` file with sequences, a list / array of sequences or a single sequence as string to
        calculate the descriptor values for.
    :return: initialized attributes :py:attr:`sequences` and :py:attr:`names`.
    :Example:

    >>> AMP = BaseDescriptor('KLLKLLKKLLKLLK')
    >>> AMP.sequences
    ['KLLKLLKKLLKLLK']
    >>> seqs = BaseDescriptor('/Path/to/file.fasta')  # load sequences from .fasta file
    >>> seqs.sequences
    ['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR'...]
    """
    # Always define both attributes so that later method calls do not fail with an
    # AttributeError when the input could not be interpreted (a message is printed instead).
    self.sequences = []
    self.names = []

    # A list / array counts as sequences when its first element is upper case; the length
    # check keeps an empty container from raising IndexError.
    if isinstance(seqs, list) and len(seqs) > 0 and seqs[0].isupper():
        self.sequences = [s.strip() for s in seqs]
    elif isinstance(seqs, np.ndarray) and len(seqs) > 0 and seqs[0].isupper():
        self.sequences = [s.strip() for s in seqs.tolist()]
    elif isinstance(seqs, str) and seqs.isupper():
        self.sequences = [seqs.strip()]
    elif isinstance(seqs, str) and os.path.isfile(seqs):
        if seqs.endswith(".fasta"):  # read .fasta file
            self.sequences, self.names = read_fasta(seqs)
        elif seqs.endswith(".csv"):  # read .csv file with sequences every line
            with open(seqs) as f:
                cntr = 0
                for line in f:
                    # only upper-case lines are taken as sequences
                    if line.isupper():
                        self.sequences.append(line.strip())
                        self.names.append("seq_" + str(cntr))
                        cntr += 1
        else:
            print("Sorry, currently only .fasta or .csv files can be read!")
    else:
        print(
            "%s does not exist, is not a valid list of AA sequences or is not a valid sequence string"
            % (seqs,)
        )

    self.descriptor = np.array([[]])
    self.target = np.array([], dtype="int")
    self.scaler = None
    self.featurenames = []
897
def read_fasta(self, filename):
    """Load sequences and their headers from a ``.FASTA`` formatted file.

    The work is done by the module-level function ``read_fasta``; its two return values are
    stored in :py:attr:`sequences` and :py:attr:`names`.

    :param filename: {str} ``.FASTA`` file with sequences and headers to read
    :return: {list} sequences in the attribute :py:attr:`sequences` with corresponding sequence names in
        :py:attr:`names`.
    """
    loaded_sequences, loaded_names = read_fasta(filename)
    self.sequences = loaded_sequences
    self.names = loaded_names
907
def save_fasta(self, filename, names=False):
    """Write the sequences in :py:attr:`sequences` to a ``.FASTA`` formatted file.

    The work is done by the module-level function ``save_fasta``.

    :param filename: {str} filename of the output ``.FASTA`` file
    :param names: {bool} whether sequence names from self.names should be saved as sequence identifiers
    :return: a FASTA formatted file containing the generated sequences
    """
    if not names:
        save_fasta(filename, self.sequences)
        return
    save_fasta(filename, self.sequences, self.names)
919
def count_aa(self, scale="relative", average=False, append=False):
    """Method for producing the amino acid distribution for the given sequences as a descriptor

    Every sequence is passed to the module-level function ``count_aas``; the values of the
    mapping it returns form one descriptor row and its keys become :py:attr:`featurenames`.
    With no sequences, :py:attr:`featurenames` becomes an empty list instead of raising.

    :param scale: {'absolute' or 'relative'} defines whether counts or frequencies are given for each AA
    :param average: {boolean} whether the averaged amino acid counts for all sequences should be returned
    :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
        attribute :py:attr:`descriptor`. Takes precedence over *average*.
    :return: the amino acid distributions for every sequence individually in the attribute :py:attr:`descriptor`
    :Example:

    >>> AMP = PeptideDescriptor('ACDEFGHIKLMNPQRSTVWY')  # count_aa() does not depend on the descriptor scale
    >>> AMP.count_aa()
    >>> AMP.descriptor
    array([[ 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, ... ]])
    >>> AMP.descriptor.shape
    (1, 20)

    .. seealso:: :py:func:`modlamp.core.count_aas()`
    """
    rows = []
    feature_names = []  # stays empty when there are no sequences
    for seq in self.sequences:
        counts = count_aas(seq, scale)
        rows.append(list(counts.values()))
        feature_names = list(counts.keys())

    desc = np.array(rows)
    self.featurenames = feature_names

    if append:
        if rows:  # nothing to stack when no sequence was counted
            self.descriptor = np.hstack((self.descriptor, desc))
    elif average:
        self.descriptor = np.mean(desc, axis=0)
    else:
        self.descriptor = desc
953
def count_ngrams(self, n):
    """Count the n-grams of all sequences in self.sequences.

    Each sequence is handed to the module-level function ``count_ngrams`` together with *n*;
    the per-sequence counts are summed over all sequences.

    :param n: {int or list of ints} n-gram size(s), passed on unchanged to the module-level ``count_ngrams``
    :return: {dict} dictionary with n-grams as keys and their counts in the sequence as values in :py:attr:`descriptor`
    :Example:

    >>> D = PeptideDescriptor('GLLDFLSLAALSLDKLVKKGALS')
    >>> D.count_ngrams([2, 3])
    >>> D.descriptor
    {'LS': 3, 'LD': 2, 'LSL': 2, 'AL': 2, ..., 'LVK': 1}

    .. seealso:: :py:func:`modlamp.core.count_ngrams()`
    """
    totals = {}
    for sequence in self.sequences:
        per_sequence = count_ngrams(sequence, n)
        for gram, count in per_sequence.items():
            if gram not in totals:
                totals[gram] = count
            else:
                totals[gram] += count
    self.descriptor = totals
977
def feature_scaling(self, stype="standard", fit=True):
    """Method for feature scaling of the calculated descriptor matrix.

    :param stype: {'standard' or 'minmax'} type of scaling to be used
    :param fit: {boolean} defines whether the used scaler is first fitting on the data (True) or
        whether the already fitted scaler in :py:attr:`scaler` should be used to transform (False).
    :return: scaled descriptor values in :py:attr:`descriptor`
    :raises ValueError: if ``fit=False`` is requested while :py:attr:`scaler` is still ``None``.
    :Example:

    >>> D.descriptor
    array([[0.155],[0.34],[0.16235294],[-0.08842105],[0.116]])
    >>> D.feature_scaling(stype='minmax',fit=True)
    >>> D.descriptor
    array([[0.56818182],[1.],[0.5853447],[0.],[0.47714988]])
    """
    if stype not in ["standard", "minmax"]:
        print("Unknown scaler type!\nAvailable: 'standard', 'minmax'")
        return

    if fit:
        # A new scaler is only created when it is going to be fitted.
        if stype == "standard":
            self.scaler = StandardScaler()
        else:
            self.scaler = MinMaxScaler()
        self.descriptor = self.scaler.fit_transform(self.descriptor)
        return

    # fit=False: reuse the scaler stored by an earlier call. Creating a fresh one here
    # would discard the fitted state and make transform() fail.
    if getattr(self, "scaler", None) is None:
        raise ValueError(
            "No fitted scaler available; call feature_scaling(fit=True) first."
        )
    self.descriptor = self.scaler.transform(self.descriptor)
1004
def feature_shuffle(self):
    """Randomly permute the feature columns of the descriptor matrix.

    The matrix is transposed, passed to ``shuffle`` and transposed back.

    :return: descriptor matrix with shuffled feature columns in :py:attr:`descriptor`
    :Example:

    >>> D.descriptor
    array([[0.80685625,167.05234375,39.56818125,-0.26338667,155.16888667,33.48778]])
    >>> D.feature_shuffle()
    >>> D.descriptor
    array([[155.16888667,-0.26338667,167.05234375,0.80685625,39.56818125,33.48778]])
    """
    features_first = self.descriptor.transpose()
    shuffled = shuffle(features_first)
    self.descriptor = shuffled.transpose()
1017
def sequence_order_shuffle(self):
    """Shuffle the order of the entries in the attribute :py:attr:`sequences`.

    Only :py:attr:`sequences` is reassigned here; no other attribute is touched.

    :return: sequences in :py:attr:`sequences` with shuffled order in the list.
    :Example:

    >>> D.sequences
    ['LILRALKGAARALKVA','VKIAKIALKIIKGLG','VGVRLIKGIGRVARGAI','LRGLRGVIRGGKAIVRVGK','GGKLVRLIARIGKGV']
    >>> D.sequence_order_shuffle()
    >>> D.sequences
    ['VGVRLIKGIGRVARGAI','LILRALKGAARALKVA','LRGLRGVIRGGKAIVRVGK','GGKLVRLIARIGKGV','VKIAKIALKIIKGLG']
    """
    reordered = shuffle(self.sequences)
    self.sequences = reordered
1031
def random_selection(self, num):
    """Randomly keep *num* entries of the instance, drawn without replacement.

    The same random indices are applied to :py:attr:`sequences` and, when they are present and
    non-empty, to :py:attr:`descriptor`, :py:attr:`names` and :py:attr:`target`.

    :param num: {int} number of entries to be randomly selected
    :return: updated instance
    :Example:

    >>> h = Helices(7, 28, 100)
    >>> h.generate_helices()
    >>> desc = PeptideDescriptor(h.sequences, 'eisenberg')
    >>> desc.calculate_moment()
    >>> len(desc.sequences)
    100
    >>> len(desc.descriptor)
    100
    >>> desc.random_selection(10)
    >>> len(desc.sequences)
    10
    >>> len(desc.descriptor)
    10

    .. versionadded:: v2.2.3
    """
    total = len(self.sequences)
    chosen = np.random.choice(total, size=num, replace=False)

    all_sequences = np.array(self.sequences)
    self.sequences = all_sequences[chosen].tolist()

    has_descriptor = hasattr(self, "descriptor") and self.descriptor.size
    if has_descriptor:
        self.descriptor = self.descriptor[chosen]

    has_names = hasattr(self, "names") and self.names
    if has_names:
        all_names = np.array(self.names)
        self.names = all_names[chosen].tolist()

    has_target = hasattr(self, "target") and self.target.size
    if has_target:
        self.target = self.target[chosen]
1065
def minmax_selection(self, iterations, distmetric="euclidean", seed=0):
    """Method to select a specified number of sequences according to the minmax algorithm.

    A random first entry is picked. In every further iteration, for each already selected entry
    the most distant remaining entry is determined, and of these candidates the one with the
    smallest such distance is added to the selection.

    :param iterations: {int} Number of sequences to retrieve.
    :param distmetric: Distance metric to calculate the distances between the sequences in descriptor space.
        Passed unchanged to ``scipy.spatial.distance.cdist``.
    :param seed: {int} Set a random seed for numpy to pick the first sequence.
    :return: updated instance

    .. seealso:: **SciPy** http://docs.scipy.org/doc/scipy/reference/spatial.distance.html
    """
    descriptor = self.descriptor

    # Original row indices of the entries that are still available. Tracking indices directly
    # (instead of searching the descriptor matrix for an equal row afterwards) stays correct
    # when several sequences share identical descriptor values.
    remaining = np.arange(len(descriptor))

    # Randomly selecting the first peptide; randint excludes the upper bound, so the index
    # is always valid.
    np.random.seed(seed)
    first = int(np.random.randint(0, len(remaining)))
    minmaxidx = [int(remaining[first])]
    remaining = np.delete(remaining, first)

    for _ in range(iterations - 1):
        # Distances from every remaining peptide (rows) to every selected peptide (columns)
        dist = distance.cdist(descriptor[remaining], descriptor[minmaxidx], distmetric)

        # Most distant remaining peptide for every selected peptide
        maxidx = np.argmax(dist, axis=0)
        maxcols = np.max(dist, axis=0)

        # Choosing minimal distance among the maximal distances
        pick = int(maxidx[np.argmin(maxcols)])

        # Record the original index first, then remove the entry from the pool
        minmaxidx.append(int(remaining[pick]))
        remaining = np.delete(remaining, pick)

    self.sequences = np.array(self.sequences)[minmaxidx].tolist()
    if hasattr(self, "descriptor") and self.descriptor.size:
        self.descriptor = self.descriptor[minmaxidx]
    if hasattr(self, "names") and self.names:
        self.names = np.array(self.names)[minmaxidx].tolist()
    if hasattr(self, "target") and self.target.size:
        self.target = self.target[minmaxidx]
1123
def filter_sequences(self, sequences):
    """Remove the given sequences, together with their associated data, from the instance.

    For each queried sequence the position of its first occurrence in :py:attr:`sequences` is
    looked up; the entries at these positions are deleted from :py:attr:`sequences` and, when
    present and non-empty, from :py:attr:`descriptor`, :py:attr:`names` and :py:attr:`target`.

    :param sequences: {list} sequences to be filtered out of the whole instance, including corresponding data.
        A single sequence string is accepted as well.
    :return: updated instance without filtered sequences
    :Example:

    >>> sequences = ['KLLKLLKKLLKLLK', 'ACDEFGHIK', 'GLFDIVKKVV', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALGSL']
    >>> desc = PeptideDescriptor(sequences, 'pepcats')
    >>> desc.calculate_crosscorr(7)
    >>> len(desc.descriptor)
    5
    >>> desc.filter_sequences('KLLKLLKKLLKLLK')
    >>> len(desc.descriptor)
    4
    >>> desc.sequences
    ['ACDEFGHIK', 'GLFDIVKKVV', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALGSL']
    """
    # a lone sequence string is handled like a one-element list
    queried = [sequences] if isinstance(sequences, str) else sequences

    # positions of the queried sequences in the current sequence list
    positions = [self.sequences.index(entry) for entry in queried]

    remaining_sequences = np.delete(np.array(self.sequences), positions, 0)
    self.sequences = remaining_sequences.tolist()

    if hasattr(self, "descriptor") and self.descriptor.size:
        self.descriptor = np.delete(self.descriptor, positions, 0)
    if hasattr(self, "names") and self.names:
        remaining_names = np.delete(np.array(self.names), positions, 0)
        self.names = remaining_names.tolist()
    if hasattr(self, "target") and self.target.size:
        self.target = np.delete(self.target, positions, 0)
1159
def filter_values(self, values, operator="=="):
    """Filter the instance by comparing every descriptor column with a reference value.

    Column ``d`` of :py:attr:`descriptor` is compared with ``values[d]`` using *operator*. Only
    rows fulfilling the comparison are kept, and the columns are processed one after the other
    on the already filtered data. :py:attr:`sequences` and, when present and non-empty,
    :py:attr:`names` and :py:attr:`target` are filtered with the same row indices.

    :param values: {list} values to filter the attribute :py:attr:`descriptor` for (one per feature column)
    :param operator: {str} filter criterion, available the operators ``==``, ``<``, ``>``, ``<=`` and ``>=``.
    :return: descriptor matrix and updated sequences containing only entries with descriptor values given in
        *values* in the corresponding attributes.
    :raises KeyError: if *operator* is not one of the available operators.
    :Example:

    >>> desc.descriptor  # desc = BaseDescriptor instance
    array([[ 0.7666517 ],
        [ 0.38373498]])
    >>> desc.filter_values([0.5], '<')
    >>> desc.descriptor
    array([[ 0.38373498]])
    """
    comparisons = (
        ("==", np.equal),
        ("<", np.less),
        (">", np.greater),
        ("<=", np.less_equal),
        (">=", np.greater_equal),
    )
    n_features = self.descriptor.shape[1]
    for column in range(n_features):  # for all the features in self.descriptor
        compare = next(
            (func for symbol, func in comparisons if operator == symbol), None
        )
        if compare is None:
            raise KeyError(
                "available operators: ``==``, ``<``, ``>``, ``<=``and ``>=``"
            )
        mask = compare(self.descriptor[:, column], values[column])
        rows = np.where(mask)[0]

        # filter descriptor matrix, sequence list and names list according to obtained indices
        self.sequences = np.array(self.sequences)[rows].tolist()
        if hasattr(self, "descriptor") and self.descriptor.size:
            self.descriptor = self.descriptor[rows]
        if hasattr(self, "names") and self.names:
            self.names = np.array(self.names)[rows].tolist()
        if hasattr(self, "target") and self.target.size:
            self.target = self.target[rows]
1203
def filter_aa(self, amino_acids):
    """Method to filter out corresponding names and descriptor values of sequences with given amino acids in the
    argument list *amino_acids*.

    The entries of *amino_acids* are joined with ``|`` and used as a regular expression; every
    sequence in which the pattern is found is removed. An empty list leaves the instance
    unchanged.

    :param amino_acids: list of amino acids to be filtered
    :return: filtered list of sequences, descriptor values, target values and names in the corresponding attributes.
    :Example:

    >>> d = BaseDescriptor(['AAALLLIIIKKK', 'CCEERRT', 'LLVVIIFFFQQ'])
    >>> d.filter_aa(['C'])
    >>> d.sequences
    ['AAALLLIIIKKK', 'LLVVIIFFFQQ']
    """
    if not amino_acids:
        # An empty pattern matches every string and would discard all entries.
        return

    pattern = re.compile("|".join(amino_acids))
    seqs = []
    desc = []
    names = []
    target = []

    # evaluated once: these conditions do not change while filtering
    has_descriptor = hasattr(self, "descriptor") and self.descriptor.size
    has_names = hasattr(self, "names") and self.names
    has_target = hasattr(self, "target") and self.target.size

    for i, s in enumerate(self.sequences):
        if pattern.search(s):
            continue
        seqs.append(s)
        if has_descriptor:
            desc.append(self.descriptor[i])
        if has_names:
            names.append(self.names[i])
        if has_target:
            target.append(self.target[i])

    self.sequences = seqs
    self.names = names
    self.descriptor = np.array(desc)
    self.target = np.array(target, dtype="int")
1239
def filter_duplicates(self):
    """Method to filter duplicates in the sequences from the class attribute :py:attr:`sequences`

    The first occurrence of every sequence is kept. If :py:attr:`names` is empty, names of the
    form ``Seq_<index>`` are generated before filtering. :py:attr:`descriptor` and
    :py:attr:`target` are filtered with the same row indices when they are non-empty; empty
    ones are left as they are (no placeholder values are created) and numeric data keeps its
    dtype.

    :return: filtered sequences list in the attribute :py:attr:`sequences` and corresponding names.
    :Example:

    >>> b = BaseDescriptor(['KLLKLLKKLLKLLK', 'KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK', 'KLAKLAKKLAKLAK'])
    >>> b.filter_duplicates()
    >>> b.sequences
    ['KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK']

    .. versionadded:: v2.2.5
    """
    if not self.names:
        self.names = ["Seq_" + str(i) for i in range(len(self.sequences))]

    # Row indices of the first occurrence of every distinct sequence, in original order.
    # Done in plain Python: truth-testing numpy arrays is ambiguous, stacking strings and
    # numbers into one array turns everything into strings, and ``Series.get_values()`` /
    # positional ``keep`` in ``drop_duplicates`` no longer exist in pandas.
    seen = set()
    keep = []
    for i, seq in enumerate(self.sequences):
        if seq not in seen:
            seen.add(seq)
            keep.append(i)

    self.sequences = [self.sequences[i] for i in keep]
    self.names = [self.names[i] for i in keep]
    if hasattr(self, "descriptor") and np.size(self.descriptor):
        self.descriptor = np.asarray(self.descriptor)[keep]
    if hasattr(self, "target") and np.size(self.target):
        self.target = np.asarray(self.target)[keep]
1270
def keep_natural_aa(self):
    """Method to keep only sequences that consist exclusively of natural amino acids, i.e. whose upper-cased
    characters are all in ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'].

    Kept sequences are stored in upper case.

    :return: filtered sequence list in the attribute :py:attr:`sequences`. The other attributes are also filtered
        accordingly (if present).
    :Example:

    >>> b = BaseSequence(2)
    >>> b.sequences = ['BBBsdflUasUJfBJ', 'GLFDIVKKVVGALGSL']
    >>> b.keep_natural_aa()
    >>> b.sequences
    ['GLFDIVKKVVGALGSL']
    """
    alphabet = set("ACDEFGHIKLMNPQRSTVWY")

    # upper-case once, then remember the positions of all sequences made of natural residues only
    upper_seqs = [s.upper() for s in self.sequences]
    kept = [pos for pos, seq in enumerate(upper_seqs) if set(seq) <= alphabet]

    kept_desc = []
    kept_names = []
    kept_target = []
    if kept:
        # companion attributes are only carried along when they exist and are filled
        if hasattr(self, "descriptor") and self.descriptor.size:
            kept_desc = [self.descriptor[pos] for pos in kept]
        if hasattr(self, "names") and self.names:
            kept_names = [self.names[pos] for pos in kept]
        if hasattr(self, "target") and self.target.size:
            kept_target = [self.target[pos] for pos in kept]

    self.sequences = [upper_seqs[pos] for pos in kept]
    self.names = kept_names
    self.descriptor = np.array(kept_desc)
    self.target = np.array(kept_target, dtype="int")
1329
def load_descriptordata(
    self, filename, delimiter=",", targets=False, skip_header=0
):
    """Method to load any data file with sequences and descriptor values into the attributes of this instance.

    The first column of the file is read as sequences, all following columns as float descriptor values.

    .. note:: Headers are not considered. To skip initial lines in the file, use the *skip_header* option.

    :param filename: {str} filename of the data file to be loaded
    :param delimiter: {str} column delimiter
    :param targets: {boolean} whether last column in the file contains a target class vector. If ``True``, the
        last column is stored as integers in :py:attr:`target`; it also stays part of :py:attr:`descriptor`.
    :param skip_header: {int} number of initial lines to skip in the file
    :return: loaded sequences, descriptor values and targets in the corresponding attributes.
    """
    data = np.genfromtxt(filename, delimiter=delimiter, skip_header=skip_header)
    data = data[:, 1:]  # skip sequences as they are "nan" when read as float
    # The header lines must be skipped here as well, otherwise they end up as "sequences" and the sequence list
    # no longer lines up with the descriptor rows.
    seqs = np.genfromtxt(
        filename, delimiter=delimiter, dtype="str", skip_header=skip_header
    )
    seqs = seqs[:, 0]
    if targets:
        self.target = np.array(data[:, -1], dtype="int")
    self.sequences = seqs
    self.descriptor = data
1352
def save_descriptor(self, filename, delimiter=",", targets=None, header=None):
    """Method to save the descriptor values to a .csv/.txt file

    Every row holds the sequence name (only if there is one name per sequence), the sequence, the descriptor
    values and, optionally, the target value.

    :param filename: filename of the output file
    :param delimiter: column delimiter
    :param targets: target class vector to be added to descriptor (same length as :py:attr:`sequences`); it is
        ignored if its length differs from the number of sequences.
    :param header: {str} header to be written at the beginning of the file (if ``None``: feature names are taken)
    :return: output file with peptide names and descriptor values
    """
    # Use unicode strings: a bytes dtype ("|S80") is written as "b'...'" on Python 3 and cuts long sequences.
    seqs = np.array(self.sequences, dtype=str)[:, np.newaxis]
    ids = np.array(self.names, dtype=str)[:, np.newaxis]
    columns = [np.hstack((ids, seqs)) if ids.shape == seqs.shape else seqs]

    descriptor = np.asarray(self.descriptor)
    if descriptor.ndim == 1:
        descriptor = descriptor[:, np.newaxis]  # one value per sequence -> single column
    # convert explicitly instead of relying on implicit number -> string promotion when stacking
    columns.append(descriptor.astype(str))

    # ``if targets`` is ambiguous (raises) for numpy arrays, so test for None explicitly
    if targets is not None and len(targets) == len(self.sequences):
        columns.append(np.asarray(targets).astype(str)[:, np.newaxis])

    data = np.hstack(columns)
    if not header:
        # NOTE(review): only the first element of every feature name entry is used and the header is always
        # joined with ", " regardless of *delimiter* -- kept as is, confirm against the users of featurenames.
        featurenames = [["Sequence"]] + self.featurenames
        header = ", ".join([f[0] for f in featurenames])
    np.savetxt(filename, data, delimiter=delimiter, fmt="%s", header=header)
1377
1378
def load_scale(scalename):
    """Method to load scale values for a given amino acid scale

    :param scalename: amino acid scale name, for available scales see the
        :class:`modlamp.descriptors.PeptideDescriptor()` documentation. Two special names exist: ``"all"`` returns,
        per amino acid, the values of all predefined scales concatenated into one list; ``"instability"`` returns a
        nested dictionary of values keyed by two amino acid letters.
    :return: tuple of the scale name and the amino acid scale values in dictionary format.
    """
    # predefined amino acid scales: {scale name: {amino acid: list of values}}
    scales = {
        "aasi": {
            "A": [1.89], "C": [1.73], "D": [3.13], "E": [3.14], "F": [1.53],
            "G": [2.67], "H": [3], "I": [1.97], "K": [2.28], "L": [1.74],
            "M": [2.5], "N": [2.33], "P": [0.22], "Q": [3.05], "R": [1.91],
            "S": [2.14], "T": [2.18], "V": [2.37], "W": [2], "Y": [2.01],
        },
        "abhprk": {
            "A": [0, 0, 0, 0, 0, 0], "C": [0, 0, 0, 0, 0, 0],
            "D": [1, 0, 0, 1, 0, 0], "E": [1, 0, 0, 1, 0, 0],
            "F": [0, 0, 1, 0, 1, 0], "G": [0, 0, 0, 0, 0, 0],
            "H": [0, 0, 0, 1, 1, 0], "I": [0, 0, 1, 0, 0, 0],
            "K": [0, 1, 0, 1, 0, 0], "L": [0, 0, 1, 0, 0, 0],
            "M": [0, 0, 1, 0, 0, 0], "N": [0, 0, 0, 1, 0, 0],
            "P": [0, 0, 0, 0, 0, 1], "Q": [0, 0, 0, 1, 0, 0],
            "R": [0, 1, 0, 1, 0, 0], "S": [0, 0, 0, 1, 0, 0],
            "T": [0, 0, 0, 1, 0, 0], "V": [0, 0, 1, 0, 0, 0],
            "W": [0, 0, 1, 0, 1, 0], "Y": [0, 0, 0, 1, 1, 0],
        },
        "argos": {
            "I": [0.77], "F": [1.2], "V": [0.14], "L": [2.3], "W": [0.07],
            "M": [2.3], "A": [0.64], "G": [-0.48], "C": [0.25], "Y": [-0.41],
            "P": [-0.31], "T": [-0.13], "S": [-0.25], "H": [-0.87], "E": [-0.94],
            "N": [-0.89], "Q": [-0.61], "D": [-1], "K": [-1], "R": [-0.68],
        },
        "bulkiness": {
            "A": [0.443], "C": [0.551], "D": [0.453], "E": [0.557], "F": [0.898],
            "G": [0], "H": [0.563], "I": [0.985], "K": [0.674], "L": [0.985],
            "M": [0.703], "N": [0.516], "P": [0.768], "Q": [0.605], "R": [0.596],
            "S": [0.332], "T": [0.677], "V": [0.995], "W": [1], "Y": [0.801],
        },
        "charge_phys": {
            "A": [0.0], "C": [-0.1], "D": [-1.0], "E": [-1.0], "F": [0.0],
            "G": [0.0], "H": [0.1], "I": [0.0], "K": [1.0], "L": [0.0],
            "M": [0.0], "N": [0.0], "P": [0.0], "Q": [0.0], "R": [1.0],
            "S": [0.0], "T": [0.0], "V": [0.0], "W": [0.0], "Y": [0.0],
        },
        "charge_acid": {
            "A": [0.0], "C": [-0.1], "D": [-1.0], "E": [-1.0], "F": [0.0],
            "G": [0.0], "H": [1.0], "I": [0.0], "K": [1.0], "L": [0.0],
            "M": [0.0], "N": [0.0], "P": [0.0], "Q": [0.0], "R": [1.0],
            "S": [0.0], "T": [0.0], "V": [0.0], "W": [0.0], "Y": [0.0],
        },
        "cougar": {
            "A": [0.25, 0.62, 1.89], "C": [0.208, 0.29, 1.73],
            "D": [0.875, -0.9, 3.13], "E": [0.833, -0.74, 3.14],
            "F": [0.042, 1.2, 1.53], "G": [1, 0.48, 2.67],
            "H": [0.083, -0.4, 3], "I": [0.667, 1.4, 1.97],
            "K": [0.708, -1.5, 2.28], "L": [0.292, 1.1, 1.74],
            "M": [0, 0.64, 2.5], "N": [0.667, -0.78, 2.33],
            "P": [0.875, 0.12, 0.22], "Q": [0.792, -0.85, 3.05],
            "R": [0.958, -2.5, 1.91], "S": [0.875, -0.18, 2.14],
            "T": [0.583, -0.05, 2.18], "V": [0.375, 1.1, 2.37],
            "W": [0.042, 0.81, 2], "Y": [0.5, 0.26, 2.01],
        },
        "eisenberg": {
            "I": [1.4], "F": [1.2], "V": [1.1], "L": [1.1], "W": [0.81],
            "M": [0.64], "A": [0.62], "G": [0.48], "C": [0.29], "Y": [0.26],
            "P": [0.12], "T": [-0.05], "S": [-0.18], "H": [-0.4], "E": [-0.74],
            "N": [-0.78], "Q": [-0.85], "D": [-0.9], "K": [-1.5], "R": [-2.5],
        },
        "ez": {
            "A": [-0.29, 10.22, 4.67], "C": [0.95, 13.69, 5.77],
            "D": [1.19, 14.25, 8.98], "E": [1.3, 14.66, 4.16],
            "F": [-0.8, 19.67, 7.12], "G": [-0.01, 13.86, 6],
            "H": [0.75, 12.26, 2.77], "I": [-0.56, 14.34, 10.69],
            "K": [1.66, 11.11, 2.09], "L": [-0.64, 17.34, 8.61],
            "M": [-0.28, 18.04, 7.13], "N": [0.89, 12.78, 6.28],
            "P": [0.83, 18.09, 3.53], "Q": [1.21, 10.46, 2.59],
            "R": [1.55, 9.34, 4.68], "S": [0.1, 13.86, 6],
            "T": [0.01, 13.86, 6], "V": [-0.47, 11.35, 4.97],
            "W": [-0.85, 11.65, 7.2], "Y": [-0.42, 13.04, 6.2],
        },
        "flexibility": {
            "A": [0.25], "C": [0.208], "D": [0.875], "E": [0.833], "F": [0.042],
            "G": [1], "H": [0.083], "I": [0.667], "K": [0.708], "L": [0.292],
            "M": [0.0], "N": [0.667], "P": [0.875], "Q": [0.792], "R": [0.958],
            "S": [0.875], "T": [0.583], "V": [0.375], "W": [0.042], "Y": [0.5],
        },
        "grantham": {
            "A": [0, 8.1, 31], "C": [2.75, 5.5, 55],
            "D": [1.38, 13.0, 54], "E": [0.92, 12.3, 83],
            "F": [0, 5.2, 132], "G": [0.74, 9.0, 3],
            "H": [0.58, 10.4, 96], "I": [0, 5.2, 111],
            "K": [0.33, 11.3, 119], "L": [0, 4.9, 111],
            "M": [0, 5.7, 105], "N": [1.33, 11.6, 56],
            "P": [0.39, 8.0, 32.5], "Q": [0.89, 10.5, 85],
            "R": [0.65, 10.5, 124], "S": [1.42, 9.2, 32],
            "T": [0.71, 8.6, 61], "V": [0, 5.9, 84],
            "W": [0.13, 5.4, 170], "Y": [0.20, 6.2, 136],
        },
        "gravy": {
            "I": [4.5], "V": [4.2], "L": [3.8], "F": [2.8], "C": [2.5],
            "M": [1.9], "A": [1.8], "G": [-0.4], "T": [-0.7], "W": [-0.9],
            "S": [-0.8], "Y": [-1.3], "P": [-1.6], "H": [-3.2], "E": [-3.5],
            "Q": [-3.5], "D": [-3.5], "N": [-3.5], "K": [-3.9], "R": [-4.5],
        },
        "hopp-woods": {
            "A": [-0.5], "C": [-1], "D": [3], "E": [3], "F": [-2.5],
            "G": [0], "H": [-0.5], "I": [-1.8], "K": [3], "L": [-1.8],
            "M": [-1.3], "N": [0.2], "P": [0], "Q": [0.2], "R": [3],
            "S": [0.3], "T": [-0.4], "V": [-1.5], "W": [-3.4], "Y": [-2.3],
        },
        "isaeci": {
            "A": [62.9, 0.05], "C": [78.51, 0.15], "D": [18.46, 1.25], "E": [30.19, 1.31],
            "F": [189.42, 0.14], "G": [19.93, 0.02], "H": [87.38, 0.56], "I": [149.77, 0.09],
            "K": [102.78, 0.53], "L": [154.35, 0.1], "M": [132.22, 0.34], "N": [19.53, 1.36],
            "P": [122.35, 0.16], "Q": [17.87, 1.31], "R": [52.98, 1.69], "S": [19.75, 0.56],
            "T": [59.44, 0.65], "V": [120.91, 0.07], "W": [179.16, 1.08], "Y": [132.16, 0.72],
        },
        "janin": {
            "I": [1.2], "F": [0.87], "V": [1], "L": [0.87], "W": [0.59],
            "M": [0.73], "A": [0.59], "G": [0.59], "C": [1.4], "Y": [-0.4],
            "P": [-0.26], "T": [-0.12], "S": [0.02], "H": [0.02], "E": [-0.83],
            "N": [-0.55], "Q": [-0.83], "D": [-0.69], "K": [-2.4], "R": [-1.8],
        },
        "kytedoolittle": {
            "I": [1.7], "F": [1.1], "V": [1.6], "L": [1.4], "W": [-0.14],
            "M": [0.8], "A": [0.77], "G": [0.03], "C": [1], "Y": [-0.27],
            "P": [-0.37], "T": [-0.07], "S": [-0.1], "H": [-0.91], "E": [-1],
            "N": [-1], "Q": [-1], "D": [-1], "K": [-1.1], "R": [-1.3],
        },
        "levitt_alpha": {
            "A": [1.29], "C": [1.11], "D": [1.04], "E": [1.44], "F": [1.07],
            "G": [0.56], "H": [1.22], "I": [0.97], "K": [1.23], "L": [1.3],
            "M": [1.47], "N": [0.9], "P": [0.52], "Q": [1.27], "R": [0.96],
            "S": [0.82], "T": [0.82], "V": [0.91], "W": [0.99], "Y": [0.72],
        },
        "mss": {
            "A": [13.02], "C": [23.7067], "D": [22.02], "E": [20.0233], "F": [23.5288],
            "G": [1.01], "H": [23.5283], "I": [22.3611], "K": [18.9756], "L": [19.6944],
            "M": [21.92], "N": [21.8567], "P": [19.0242], "Q": [19.9689], "R": [19.0434],
            "S": [18.3533], "T": [22.3567], "V": [21.0267], "W": [26.1975], "Y": [24.1954],
        },
        "msw": {
            "A": [-0.73, 0.2, -0.62], "C": [-0.66, 0.26, -0.27],
            "D": [0.11, -1, -0.96], "E": [0.24, -0.39, -0.04],
            "F": [0.76, 0.85, -0.34], "G": [-0.31, -0.28, -0.75],
            "H": [0.84, 0.67, -0.78], "I": [-0.91, 0.83, -0.25],
            "K": [-0.51, 0.08, 0.6], "L": [-0.74, 0.72, -0.16],
            "M": [-0.7, 1, -0.32], "N": [0.14, 0.2, -0.66],
            "P": [-0.43, 0.73, -0.6], "Q": [0.3, 1, -0.3],
            "R": [-0.22, 0.27, 1], "S": [-0.8, 0.61, -1],
            "T": [-0.58, 0.85, -0.89], "V": [-1, 0.79, -0.58],
            "W": [1, 0.98, -0.47], "Y": [0.97, 0.66, -0.16],
        },
        "pepcats": {
            "A": [1, 0, 0, 0, 0, 0], "C": [1, 0, 1, 1, 0, 0],
            "D": [0, 0, 1, 0, 0, 1], "E": [0, 0, 1, 0, 0, 1],
            "F": [1, 1, 0, 0, 0, 0], "G": [0, 0, 0, 0, 0, 0],
            "H": [1, 1, 0, 1, 1, 0], "I": [1, 0, 0, 0, 0, 0],
            "K": [1, 0, 0, 1, 1, 0], "L": [1, 0, 0, 0, 0, 0],
            "M": [1, 0, 1, 0, 0, 0], "N": [0, 0, 1, 1, 0, 0],
            "P": [1, 0, 0, 0, 0, 0], "Q": [0, 0, 1, 1, 0, 0],
            "R": [1, 0, 0, 1, 1, 0], "S": [0, 0, 1, 1, 0, 0],
            "T": [0, 0, 1, 1, 0, 0], "V": [1, 0, 0, 0, 0, 0],
            "W": [1, 1, 0, 1, 0, 0], "Y": [1, 1, 1, 1, 0, 0],
        },
        "peparc": {
            "A": [1, 0, 0, 0, 0], "C": [0, 1, 0, 0, 0],
            "D": [0, 1, 0, 1, 0], "E": [0, 1, 0, 1, 0],
            "F": [1, 0, 0, 0, 0], "G": [0, 0, 0, 0, 0],
            "H": [0, 1, 1, 0, 0], "I": [1, 0, 0, 0, 0],
            "K": [0, 1, 1, 0, 0], "L": [1, 0, 0, 0, 0],
            "M": [1, 0, 0, 0, 0], "N": [0, 1, 0, 0, 0],
            "P": [0, 0, 0, 0, 1], "Q": [0, 1, 0, 0, 0],
            "R": [0, 1, 1, 0, 0], "S": [0, 1, 0, 0, 0],
            "T": [0, 1, 0, 0, 0], "V": [1, 0, 0, 0, 0],
            "W": [1, 0, 0, 0, 0], "Y": [1, 0, 0, 0, 0],
        },
        "polarity": {
            "A": [0.395], "C": [0.074], "D": [1.0], "E": [0.914], "F": [0.037],
            "G": [0.506], "H": [0.679], "I": [0.037], "K": [0.79], "L": [0.0],
            "M": [0.099], "N": [0.827], "P": [0.383], "Q": [0.691], "R": [0.691],
            "S": [0.531], "T": [0.457], "V": [0.123], "W": [0.062], "Y": [0.16],
        },
        "ppcali": {
            "A": [
                0.070781, 0.036271, 2.042, 0.083272, 0.69089, 0.15948, -0.80893,
                0.24698, 0.86525, 0.68563, -0.24665, 0.61314, -0.53343, -0.50878,
                -1.3646, 2.2679, -1.5644, -0.75043, -0.65875,
            ],
            "C": [
                0.61013, -0.93043, -0.85983, -2.2704, 1.5877, -2.0066, -0.30314,
                1.2544, -0.2832, -1.2844, -0.73449, -0.11235, -0.41152, -0.0050164,
                0.28307, 0.20522, -0.021084, -0.15627, -0.32689,
            ],
            "D": [
                -1.3215, 0.24063, -0.032754, -0.37863, 1.2051, 1.0001, 2.1827,
                0.19212, -0.60529, 0.37639, -0.46451, -0.46788, 1.4077, -2.1661,
                0.72604, -0.12332, -0.8243, -0.082989, 0.053476,
            ],
            "E": [
                -0.87713, 1.4905, 1.0755, 0.35944, 1.567, 0.41365, 1.0944,
                0.72634, -0.74957, 0.038939, 0.075057, 0.78637, -1.4543, 1.6667,
                -0.097439, -0.24293, 1.7687, 0.36174, -0.11585,
            ],
            "F": [
                1.3557, -0.10336, -0.4309, 0.41269, -0.083356, 0.83783, 0.095381,
                -0.65222, -0.3119, 0.43293, -1.0011, -0.66855, -0.10242, 1.2066,
                2.6234, 1.9981, -0.25016, 0.71979, 0.21569,
            ],
            "G": [
                -1.0818, -2.1561, 0.77082, -0.92747, -1.0748, 1.7997, -1.3708,
                1.279, -1.2098, 0.46065, 0.43076, 0.20037, -0.2302, 0.2646,
                0.57149, -0.68432, 0.19341, -0.061606, -0.08071,
            ],
            "H": [
                -0.050161, 0.69246, -0.88397, -0.64601, 0.24622, 0.10487, -1.1317,
                -2.3661, -0.89918, 0.46391, -0.62359, 2.5478, -0.34737, -0.52062,
                0.17522, -0.88648, -0.4755, 0.023187, -0.28261,
            ],
            "I": [
                1.4829, -0.46435, 0.50189, 0.55724, -0.51535, -0.29914, 0.97236,
                -0.15793, -0.98246, -0.54347, 0.97806, 0.37577, 1.618, 0.62323,
                -0.59359, -0.35483, -0.085017, 0.55825, -2.7542,
            ],
            "K": [
                -0.85344, 1.529, 0.27747, 0.32993, -1.1786, -0.16633, -1.0459,
                0.44621, 0.41027, -2.5318, 0.91329, 0.53385, 0.61417, -1.111,
                1.1323, 0.95105, 0.76769, -0.016115, 0.054995,
            ],
            "L": [
                1.2857, 0.039488, 1.5378, 0.87969, -0.21419, 0.40389, -0.20426,
                -0.14351, 0.61024, -1.1927, -2.2149, -0.84248, -0.5061, -0.48548,
                0.10791, -2.1503, -0.12006, -0.60222, 0.26546,
            ],
            "M": [
                1.137, 0.64388, 0.13724, -0.2988, 1.2288, 0.24981, -1.6427,
                -0.75868, -0.54902, 1.0571, 1.272, -1.9104, 0.70919, -0.93575,
                -0.6314, -0.079654, 1.634, -0.0021923, 0.49825,
            ],
            "N": [
                -1.084, -0.176, -0.47062, -0.92245, -0.32953, 0.74278, 0.34551,
                -1.4605, 0.25219, -1.2107, -0.59978, -0.79183, 1.3268, 1.9839,
                -1.6137, 0.5333, 0.033889, -1.0331, 0.83019,
            ],
            "P": [
                -1.1823, -1.6911, -1.1331, 3.073, 1.1942, -0.93426, -0.72985,
                -0.042441, -0.19264, -0.21603, -0.1239, 0.054016, 0.15241, -0.019691,
                -0.20543, 0.10206, 0.07671, -0.081968, 0.20348,
            ],
            "Q": [
                -0.57747, 0.97452, -0.077547, -0.0033488, 0.17184, -0.52537, -0.27362,
                -0.1366, 0.2057, -0.013066, 1.8834, -1.2736, -0.84991, 1.0445,
                0.69027, -1.2866, -2.6776, 0.1683, 0.086105,
            ],
            "R": [
                -0.62245, 1.545, -0.61966, 0.19057, -1.7485, -1.3909, -0.47526,
                1.3938, -0.84556, 1.7344, -1.6516, -0.52678, 0.6791, 0.24374,
                -0.62551, -0.0028271, -0.053884, 0.14926, -0.17232,
            ],
            "S": [
                -0.86409, -0.77147, 0.38542, -0.59389, -0.53313, -0.47585, 0.31966,
                -0.89716, 1.8029, 0.26431, -0.23173, -0.37626, -0.47349, -0.42878,
                -0.47297, -0.079826, 0.57043, 3.2057, -0.18413,
            ],
            "T": [
                -0.33027, -0.57447, 0.18653, -0.28941, -0.62681, -1.0737, 0.80363,
                -0.59525, 1.8786, 1.3971, 0.63929, 0.21281, -0.067048, 0.096271,
                1.323, -0.36173, 1.2261, -2.2771, -0.65412,
            ],
            "V": [
                1.1675, -0.61554, 0.95405, 0.11662, -0.74473, -1.1482, 1.1309,
                0.12079, -0.77171, 0.18597, 0.93442, 1.201, 0.3826, -0.091573,
                -0.31269, 0.074367, -0.22946, 0.24322, 2.9836,
            ],
            "W": [
                1.1881, 0.43789, -1.7915, 0.138, 0.43088, 1.6467, -0.11987,
                1.7369, 2.0818, 0.33122, 0.31829, 1.1586, 0.67649, 0.30819,
                -0.55772, -0.54491, -0.17969, 0.24477, 0.38674,
            ],
            "Y": [
                0.54671, -0.1468, -1.5688, 0.19001, -1.2736, 0.66162, 1.1614,
                -0.18614, -0.70654, -0.43634, 0.44775, -0.71366, -2.5907, -1.1649,
                -1.1576, 0.66572, 0.21019, -0.61016, -0.34844,
            ],
        },
        "refractivity": {
            "A": [0.102045615], "C": [0.841053374], "D": [0.282153774], "E": [0.405831178],
            "F": [0.691276746], "G": [0], "H": [0.512814484], "I": [0.448154244],
            "K": [0.50058782], "L": [0.441570656], "M": [0.508817305], "N": [0.282153774],
            "P": [0.256995062], "Q": [0.405831178], "R": [0.626851634], "S": [0.149306372],
            "T": [0.258876087], "V": [0.327298378], "W": [1], "Y": [0.741359041],
        },
        "t_scale": {
            "A": [-8.4, -8.01, -3.73, -3.65, -6.12, -1.59, 1.56],
            "C": [-2.44, -1.96, 0.93, -2.35, 1.31, 2.29, -1.52],
            "D": [-6.84, -0.94, 17.68, -0.03, 3.44, 9.07, 4.32],
            "E": [-6.5, 16.2, 17.28, 3.11, -4.75, -2.54, 4.72],
            "F": [21.59, -5.73, 1.03, -3.3, 2.64, -5.02, 1.7],
            "G": [-8.48, -10.37, -5.14, -6.51, -11.84, -3.6, 2.01],
            "H": [15.28, -3.67, 6.72, -6.38, 4.12, -1.55, -2.85],
            "I": [-2.97, 4.64, -0.77, 11, 3.26, -4.36, -7.88],
            "K": [2.7, 13.46, -14.03, -2.55, 2.77, 0.15, 3.19],
            "L": [2.61, 5.96, 1.97, 2.59, -4.77, -4.84, -5.44],
            "M": [3.38, 12.43, -4.77, 0.45, -1.55, -0.6, 3.26],
            "N": [-3.11, -1.22, 6.26, -9.38, 9.94, 7.66, -4.81],
            "P": [-5.35, -9.07, -1.52, -8.79, -8.73, 4.29, -9.91],
            "Q": [-5.31, 15.64, 8.44, 1.03, -4.32, -4.4, -0.52],
            "R": [-2.27, 18.9, -18.24, -3.47, 3.03, 6.64, 0.45],
            "S": [-15.88, -11.21, -2.44, -3.61, 3.46, -0.37, 8.98],
            "T": [-17.81, -13.64, -5.19, 10.57, 6.91, -4.43, 3.49],
            "V": [-5.8, -6.15, -2.26, 9.87, 5.28, -1.49, -7.54],
            "W": [21.68, -8.78, -2.53, 15.53, -8.15, 11.98, 3.23],
            "Y": [23.9, -6.47, 0.31, -4.14, 4.08, -7.28, 3.59],
        },
        "tm_tend": {
            "A": [0.38], "C": [-0.3], "D": [-3.27], "E": [-2.9], "F": [1.98],
            "G": [-0.19], "H": [-1.44], "I": [1.97], "K": [-3.46], "L": [1.82],
            "M": [1.4], "N": [-1.62], "P": [-1.44], "Q": [-1.84], "R": [-2.57],
            "S": [-0.53], "T": [-0.32], "V": [1.46], "W": [1.53], "Y": [0.49],
        },
        "z3": {
            "A": [0.07, -1.73, 0.09], "C": [0.71, -0.97, 4.13],
            "D": [3.64, 1.13, 2.36], "E": [3.08, 0.39, -0.07],
            "F": [-4.92, 1.3, 0.45], "G": [2.23, -5.36, 0.3],
            "H": [2.41, 1.74, 1.11], "I": [-4.44, -1.68, -1.03],
            "K": [2.84, 1.41, -3.14], "L": [-4.19, -1.03, -0.98],
            "M": [-2.49, -0.27, -0.41], "N": [3.22, 1.45, 0.84],
            "P": [-1.22, 0.88, 2.23], "Q": [2.18, 0.53, -1.14],
            "R": [2.88, 2.52, -3.44], "S": [1.96, -1.63, 0.57],
            "T": [0.92, -2.09, -1.4], "V": [-2.69, -2.53, -1.29],
            "W": [-4.75, 3.65, 0.85], "Y": [-1.39, 2.32, 0.01],
        },
        "z5": {
            "A": [0.24, -2.32, 0.6, -0.14, 1.3],
            "C": [0.84, -1.67, 3.71, 0.18, -2.65],
            "D": [3.98, 0.93, 1.93, -2.46, 0.75],
            "E": [3.11, 0.26, -0.11, -3.04, -0.25],
            "F": [-4.22, 1.94, 1.06, 0.54, -0.62],
            "G": [2.05, -4.06, 0.36, -0.82, -0.38],
            "H": [2.47, 1.95, 0.26, 3.9, 0.09],
            "I": [-3.89, -1.73, -1.71, -0.84, 0.26],
            "K": [2.29, 0.89, -2.49, 1.49, 0.31],
            "L": [-4.28, -1.3, -1.49, -0.72, 0.84],
            "M": [-2.85, -0.22, 0.47, 1.94, -0.98],
            "N": [3.05, 1.62, 1.04, -1.15, 1.61],
            "P": [-1.66, 0.27, 1.84, 0.7, 2],
            "Q": [1.75, 0.5, -1.44, -1.34, 0.66],
            "R": [3.52, 2.5, -3.5, 1.99, -0.17],
            "S": [2.39, -1.07, 1.15, -1.39, 0.67],
            "T": [0.75, -2.18, -1.12, -1.46, -0.4],
            "V": [-2.59, -2.64, -1.54, -0.85, -0.02],
            "W": [-4.36, 3.94, 0.59, 3.44, -1.59],
            "Y": [-2.54, 2.44, 0.43, 0.04, -1.47],
        },
    }

    if scalename == "all":
        # concatenate the values of every scale, scale after scale, into one list per amino acid
        merged = {aa: [] for aa in "IFVLWMAGCYPTSHENQDKR"}
        for table in scales.values():
            for aa, values in table.items():
                merged[aa].extend(values)
        return "all", merged

    if scalename == "instability":
        # nested lookup: first amino acid letter -> second amino acid letter -> value
        pair_values = {
            "A": {"A": 1.0, "C": 44.94, "E": 1.0, "D": -7.49, "G": 1.0, "F": 1.0, "I": 1.0, "H": -7.49, "K": 1.0,
                  "M": 1.0, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, "T": 1.0, "W": 1.0,
                  "V": 1.0, "Y": 1.0},
            "C": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 20.26, "G": 1.0, "F": 1.0, "I": 1.0, "H": 33.6, "K": 1.0,
                  "M": 33.6, "L": 20.26, "N": 1.0, "Q": -6.54, "P": 20.26, "S": 1.0, "R": 1.0, "T": 33.6,
                  "W": 24.68, "V": -6.54, "Y": 1.0},
            "E": {"A": 1.0, "C": 44.94, "E": 33.6, "D": 20.26, "G": 1.0, "F": 1.0, "I": 20.26, "H": -6.54,
                  "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0, "Q": 20.26, "P": 20.26, "S": 20.26, "R": 1.0, "T": 1.0,
                  "W": -14.03, "V": 1.0, "Y": 1.0},
            "D": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0, "K": -7.49,
                  "M": 1.0, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 1.0, "S": 20.26, "R": -6.54, "T": -14.03, "W": 1.0,
                  "V": 1.0, "Y": 1.0},
            "G": {"A": -7.49, "C": 1.0, "E": -6.54, "D": 1.0, "G": 13.34, "F": 1.0, "I": -7.49, "H": 1.0,
                  "K": -7.49, "M": 1.0, "L": 1.0, "N": -7.49, "Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0, "T": -7.49,
                  "W": 13.34, "V": 1.0, "Y": -7.49},
            "F": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 13.34, "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, "K": -14.03,
                  "M": 1.0, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, "T": 1.0, "W": 1.0,
                  "V": 1.0, "Y": 33.601},
            "I": {"A": 1.0, "C": 1.0, "E": 44.94, "D": 1.0, "G": 1.0, "F": 1.0, "I": 1.0, "H": 13.34, "K": -7.49,
                  "M": 1.0, "L": 20.26, "N": 1.0, "Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0, "T": 1.0, "W": 1.0,
                  "V": -7.49, "Y": 1.0},
            "H": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": -9.37, "F": -9.37, "I": 44.94, "H": 1.0,
                  "K": 24.68, "M": 1.0, "L": 1.0, "N": 24.68, "Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0,
                  "T": -6.54, "W": -1.88, "V": 1.0, "Y": 44.94},
            "K": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": -7.49, "F": 1.0, "I": -7.49, "H": 1.0, "K": 1.0,
                  "M": 33.6, "L": -7.49, "N": 1.0, "Q": 24.64, "P": -6.54, "S": 1.0, "R": 33.6, "T": 1.0,
                  "W": 1.0, "V": -7.49, "Y": 1.0},
            "M": {"A": 13.34, "C": 1.0, "E": 1.0, "D": 1.0, "G": 1.0, "F": 1.0, "I": 1.0, "H": 58.28, "K": 1.0,
                  "M": -1.88, "L": 1.0, "N": 1.0, "Q": -6.54, "P": 44.94, "S": 44.94, "R": -6.54, "T": -1.88,
                  "W": 1.0, "V": 1.0, "Y": 24.68},
            "L": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, "K": -7.49,
                  "M": 1.0, "L": 1.0, "N": 1.0, "Q": 33.6, "P": 20.26, "S": 1.0, "R": 20.26, "T": 1.0,
                  "W": 24.68, "V": 1.0, "Y": 1.0},
            "N": {"A": 1.0, "C": -1.88, "E": 1.0, "D": 1.0, "G": -14.03, "F": -14.03, "I": 44.94, "H": 1.0,
                  "K": 24.68, "M": 1.0, "L": 1.0, "N": 1.0, "Q": -6.54, "P": -1.88, "S": 1.0, "R": 1.0,
                  "T": -7.49, "W": -9.37, "V": 1.0, "Y": 1.0},
            "Q": {"A": 1.0, "C": -6.54, "E": 20.26, "D": 20.26, "G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0,
                  "K": 1.0, "M": 1.0, "L": 1.0, "N": 1.0, "Q": 20.26, "P": 20.26, "S": 44.94, "R": 1.0, "T": 1.0,
                  "W": 1.0, "V": -6.54, "Y": -6.54},
            "P": {"A": 20.26, "C": -6.54, "E": 18.38, "D": -6.54, "G": 1.0, "F": 20.26, "I": 1.0, "H": 1.0,
                  "K": 1.0, "M": -6.54, "L": 1.0, "N": 1.0, "Q": 20.26, "P": 20.26, "S": 20.26, "R": -6.54,
                  "T": 1.0, "W": -1.88, "V": 20.26, "Y": 1.0},
            "S": {"A": 1.0, "C": 33.6, "E": 20.26, "D": 1.0, "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, "K": 1.0,
                  "M": 1.0, "L": 1.0, "N": 1.0, "Q": 20.26, "P": 44.94, "S": 20.26, "R": 20.26, "T": 1.0,
                  "W": 1.0, "V": 1.0, "Y": 1.0},
            "R": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": -7.49, "F": 1.0, "I": 1.0, "H": 20.26, "K": 1.0,
                  "M": 1.0, "L": 1.0, "N": 13.34, "Q": 20.26, "P": 20.26, "S": 44.94, "R": 58.28, "T": 1.0,
                  "W": 58.28, "V": 1.0, "Y": -6.54},
            "T": {"A": 1.0, "C": 1.0, "E": 20.26, "D": 1.0, "G": -7.49, "F": 13.34, "I": 1.0, "H": 1.0, "K": 1.0,
                  "M": 1.0, "L": 1.0, "N": -14.03, "Q": -6.54, "P": 1.0, "S": 1.0, "R": 1.0, "T": 1.0,
                  "W": -14.03, "V": 1.0, "Y": 1.0},
            "W": {"A": -14.03, "C": 1.0, "E": 1.0, "D": 1.0, "G": -9.37, "F": 1.0, "I": 1.0, "H": 24.68,
                  "K": 1.0, "M": 24.68, "L": 13.34, "N": 13.34, "Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0,
                  "T": -14.03, "W": 1.0, "V": -7.49, "Y": 1.0},
            "V": {"A": 1.0, "C": 1.0, "E": 1.0, "D": -14.03, "G": -7.49, "F": 1.0, "I": 1.0, "H": 1.0,
                  "K": -1.88, "M": 1.0, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, "T": -7.49,
                  "W": 1.0, "V": 1.0, "Y": -6.54},
            "Y": {"A": 24.68, "C": 1.0, "E": -6.54, "D": 24.68, "G": -7.49, "F": 1.0, "I": 1.0, "H": 13.34,
                  "K": 1.0, "M": 44.94, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 13.34, "S": 1.0, "R": -15.91,
                  "T": -7.49, "W": -9.37, "V": 1.0, "Y": 13.34},
        }
        return "instability", pair_values

    # any other name must be one of the predefined scales (unknown names raise KeyError)
    return scalename, scales[scalename]
2880
2881
def read_fasta(inputfile):
    """Method for loading sequences and their names from a FASTA formatted file.
    This method is used by the base class :class:`modlamp.descriptors.PeptideDescriptor` if the input is a FASTA file.

    Every line starting with ``>`` opens a new record; its name is the text between ``>`` and the first space.
    All following lines up to the next header are stripped and concatenated into the sequence of that record.
    Lines before the first header are ignored.

    :param inputfile: .fasta file with sequences and headers to read
    :return: tuple of the list of sequences and the list of corresponding sequence names (both empty for a file
        without records).
    """
    names = []  # list for storing names
    sequences = []  # list for storing sequences
    chunks = None  # stripped lines of the record currently being read; None until the first header is seen

    with open(inputfile) as f:
        for line in f:
            if line.startswith(">"):
                # Close the previous record. Records are delimited by position only -- comparing line *content*
                # with the last line of the file would emit extra sequences whenever a line is repeated.
                if chunks is not None:
                    sequences.append("".join(chunks))
                names.append(
                    line.split(" ")[0][1:].strip()
                )  # add FASTA name without description as molecule name
                chunks = []
            elif chunks is not None:
                chunks.append(line.strip())  # remove potential white space

    if chunks is not None:
        sequences.append("".join(chunks))  # the last record has no following header to close it
    return sequences, names
2909
2910
def save_fasta(filename, sequences, names=None):
    """Write the given sequences to a file in FASTA format.

    An already existing file with the same name is deleted before the new file is written. Each sequence is written
    as a header line (``>`` followed by its identifier) and one line holding the sequence itself.

    :param filename: {str} output filename (ending .fasta)
    :param sequences: {list} sequences to be saved to file
    :param names: {list} identifiers to use in the header lines, one per sequence. If empty or ``None``, the
        identifiers ``Seq_0``, ``Seq_1``, ... are generated from the position of the sequence.
    :return: None; a FASTA formatted file containing the given sequences is written to ``filename``
    """
    # get rid of a previous output file before writing the new one
    if os.path.exists(filename):
        os.remove(filename)

    with open(filename, "w") as handle:
        for idx, sequence in enumerate(sequences):
            identifier = str(names[idx]) if names else "Seq_" + str(idx)
            handle.write(">" + identifier + "\n")
            handle.write(sequence + "\n")
2929
2930
def aa_weights():
    """Return the molecular weights of the 20 natural amino acids.

    :return: dictionary mapping the one-letter amino acid codes to their molecular weights

    .. versionadded:: v2.4.1
    """
    return dict(
        A=89.093,
        C=121.158,
        D=133.103,
        E=147.129,
        F=165.189,
        G=75.067,
        H=155.155,
        I=131.173,
        K=146.188,
        L=131.173,
        M=149.211,
        N=132.118,
        P=115.131,
        Q=146.145,
        R=174.20,
        S=105.093,
        T=119.119,
        V=117.146,
        W=204.225,
        Y=181.189,
    )
2961
2962
def count_aas(seq, scale="relative"):
    """Count how often each of the 20 natural amino acids occurs in a sequence.

    :param seq: {str} amino acid sequence
    :param scale: {'absolute' or 'relative'} with ``'relative'`` every count is divided by the sequence length
        (frequencies); with any other value the plain counts are returned
    :return: {OrderedDict} amino acid letters (alphabetically ordered) as keys and their counts or frequencies in
        the sequence as float values.
    """
    # an empty sequence is swapped for a single blank so that the relative scaling never divides by zero
    if seq == "":
        seq = " "

    divisor = len(seq) if scale == "relative" else 1.0

    result = collections.OrderedDict()
    for letter in sorted("ACDEFGHIKLMNPQRSTVWY"):
        result[letter] = float(seq.count(letter)) / divisor
    return result
3000
3001
def count_ngrams(seq, n):
    """Count the n-grams of an amino acid sequence. N can be one integer or a list of integers.

    All windows of length ``n`` are taken from the sequence with a step of one residue, so overlapping occurrences
    are counted individually (``"AAAA"`` contains the 2-gram ``"AA"`` three times).

    :param seq: {str} amino acid sequence
    :param n: {int or list of ints} length(s) of the n-grams to count
    :return: {OrderedDict} n-grams as keys and their counts in the sequence as values, ordered by decreasing
        count. N-grams with the same count keep the order in which they first appear (for a list of ``n`` the
        lengths are processed in the given order).
    """
    if seq == "":
        seq = " "  # an empty sequence is handled as a single blank
    if isinstance(n, int):
        n = [n]

    # Count the sliding windows themselves. ``str.count`` only reports non-overlapping matches and therefore
    # under-counts n-grams made of repeated residues.
    tally = collections.Counter(
        seq[j : j + i] for i in n for j in range(len(seq) - (i - 1))
    )
    # ``most_common`` sorts by decreasing count and is stable, which makes the order of ties reproducible.
    return collections.OrderedDict(tally.most_common())
3021
3022
def aa_energies():
    """Return the free energies of transfer between cyclohexane and water for the 20 natural amino acids.

    Reference: H. G. Boman, D. Wade, I. a Boman, B. Wåhlin, R. B. Merrifield, *FEBS Lett*. **1989**, *259*, 103–106.

    :return: dictionary mapping the one-letter amino acid codes to the corresponding energies.
    """
    table = (
        ("L", -4.92),
        ("I", -4.92),
        ("V", -4.04),
        ("F", -2.98),
        ("M", -2.35),
        ("W", -2.33),
        ("A", -1.81),
        ("C", -1.28),
        ("G", -0.94),
        ("Y", 0.14),
        ("T", 2.57),
        ("S", 3.40),
        ("H", 4.66),
        ("Q", 5.54),
        ("K", 5.55),
        ("N", 6.64),
        ("E", 6.81),
        ("D", 8.72),
        ("R", 14.92),
        ("P", 0.0),
    )
    return dict(table)
3052
3053
def ngrams_apd():
    """Return the most frequent 2-, 3- and 4-grams from all sequences in the `APD3 <http://aps.unmc.edu/AP/>`_,
    version August 2016 with 2727 sequences.

    For each of the lengths 2, 3 and 4, all possible n-grams were generated from all sequences and the 50 most
    frequent ones were collected into one list. Leading and trailing spaces were then stripped, and duplicates as
    well as n-grams containing spaces were removed.

    :return: numpy.array containing the most frequent n-grams
    """
    # the table is kept as blank-separated text, ten n-grams per row, and split into its entries on return
    table = """
        AGK CKI RR YGGG LSGL RG YGGY PRP LGGG GV
        GT GS GR IAG GG GF GC GGYG GA GL
        GK GI IPC KAA LAK GLGG GGLG CKIT GAGK LLSG
        LKK FLP LSG SCK LLS GETC VLG GKLL LLG C
        KCKI G VGK CSC TKKC GCS GKA IGK GESC KVCY
        KKL KKI KKC LGGL GLL CGE GGYC GLLS GLF AKK
        GKAA ESCV GLP CGES PCGE FL CGET GLW KGAA KAAL
        GGY GGG IKG LKG GGL CK GTC CG SKKC CS
        CR KC AGKA KA KG LKCK SCKL KK KI KN
        KL SK KV SL SC SG AAA VAK AAL AAK
        GGGG KNVA GGGL GYG LG LA LL LK LS LP
        GCSC TC GAA AA VA VC AG VG AI AK
        VL AL TPGC IK IA IG YGG LGK CSCK GYGG
        LGG KGA
    """
    return np.array(table.split())
3199
3200
def aa_formulas():
    """
    Return the molecular formulas of the 20 natural amino acids. All amino acids are considered in the neutral form
    (uncharged).

    :return: dictionary mapping the one-letter amino acid codes to a dictionary with the number of ``C``, ``H``,
        ``N``, ``O`` and ``S`` atoms of that amino acid.
    """
    elements = ("C", "H", "N", "O", "S")
    # atom counts per amino acid, given in the order of ``elements``
    atom_counts = (
        ("A", (3, 7, 1, 2, 0)),
        ("C", (3, 7, 1, 2, 1)),
        ("D", (4, 7, 1, 4, 0)),
        ("E", (5, 9, 1, 4, 0)),
        ("F", (9, 11, 1, 2, 0)),
        ("G", (2, 5, 1, 2, 0)),
        ("H", (6, 9, 3, 2, 0)),
        ("I", (6, 13, 1, 2, 0)),
        ("K", (6, 14, 2, 2, 0)),
        ("L", (6, 13, 1, 2, 0)),
        ("M", (5, 11, 1, 2, 1)),
        ("N", (4, 8, 2, 3, 0)),
        ("P", (5, 9, 1, 2, 0)),
        ("Q", (5, 10, 2, 3, 0)),
        ("R", (6, 14, 4, 2, 0)),
        ("S", (3, 7, 1, 3, 0)),
        ("T", (4, 9, 1, 3, 0)),
        ("V", (5, 11, 1, 2, 0)),
        ("W", (11, 12, 2, 2, 0)),
        ("Y", (9, 11, 1, 3, 0)),
    )
    return {letter: dict(zip(elements, counts)) for letter, counts in atom_counts}