# HG changeset patch
# User cpt
# Date 1656998494 0
# Node ID 9caa9aa44fd8c41cdf8492555ef0bd414443a7b8
Uploaded
diff -r 000000000000 -r 9caa9aa44fd8 cpt_helical_wheel/cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_helical_wheel/cpt-macros.xml Tue Jul 05 05:21:34 2022 +0000
@@ -0,0 +1,115 @@
+
+
+
+
+ python
+ biopython
+ requests
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
diff -r 000000000000 -r 9caa9aa44fd8 cpt_helical_wheel/generateHelicalWheel.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_helical_wheel/generateHelicalWheel.py Tue Jul 05 05:21:34 2022 +0000
@@ -0,0 +1,86 @@
+##
+
+import argparse
+from plotWheels.helical_wheel import helical_wheel
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Generate Helical Wheel")
+ parser.add_argument("--sequence",dest="sequence",type=str)
+ parser.add_argument("--seqRange",dest="seqRange",type=int,default=1)
+ parser.add_argument("--t_size",dest="t_size",type=int,default=32)
+ parser.add_argument("--rotation",dest="rotation",type=int,default=90)
+ parser.add_argument("--numbering",action="store_true",help="numbering for helical wheel")
+ parser.add_argument("--output",dest="output",type=argparse.FileType("wb"), default="_helicalwheel.png")#dest="output",default="_helicalwheel.png")
+ #### circle colors
+ parser.add_argument("--f_A",dest="f_A", default="#ffcc33")
+ parser.add_argument("--f_C",dest="f_C",default="#b5b5b5")
+ parser.add_argument("--f_D",dest="f_D",default="#db270f")
+ parser.add_argument("--f_E",dest="f_E",default="#db270f")
+ parser.add_argument("--f_F",dest="f_F",default="#ffcc33")
+ parser.add_argument("--f_G",dest="f_G",default="#b5b5b5")
+ parser.add_argument("--f_H",dest="f_H",default="#12d5fc")
+ parser.add_argument("--f_I",dest="f_I",default="#ffcc33")
+ parser.add_argument("--f_K",dest="f_K",default="#12d5fc")
+ parser.add_argument("--f_L",dest="f_L",default="#ffcc33")
+ parser.add_argument("--f_M",dest="f_M",default="#ffcc33")
+ parser.add_argument("--f_N",dest="f_N",default="#b5b5b5")
+ parser.add_argument("--f_P",dest="f_P",default="#ffcc33")
+ parser.add_argument("--f_Q",dest="f_Q",default="#b5b5b5")
+ parser.add_argument("--f_R",dest="f_R",default="#12d5fc")
+ parser.add_argument("--f_S",dest="f_S",default="#b5b5b5")
+ parser.add_argument("--f_T",dest="f_T",default="#b5b5b5")
+ parser.add_argument("--f_V",dest="f_V",default="#ffcc33")
+ parser.add_argument("--f_W",dest="f_W",default="#ffcc33")
+ parser.add_argument("--f_Y",dest="f_Y",default="#b5b5b5")
+ ### text colors
+ parser.add_argument("--t_A",dest="t_A",default="k")
+ parser.add_argument("--t_C",dest="t_C",default="k")
+ parser.add_argument("--t_D",dest="t_D",default="w")
+ parser.add_argument("--t_E",dest="t_E",default="w")
+ parser.add_argument("--t_F",dest="t_F",default="k")
+ parser.add_argument("--t_G",dest="t_G",default="k")
+ parser.add_argument("--t_H",dest="t_H",default="k")
+ parser.add_argument("--t_I",dest="t_I",default="k")
+ parser.add_argument("--t_K",dest="t_K",default="k")
+ parser.add_argument("--t_L",dest="t_L",default="k")
+ parser.add_argument("--t_M",dest="t_M",default="k")
+ parser.add_argument("--t_N",dest="t_N",default="k")
+ parser.add_argument("--t_P",dest="t_P",default="k")
+ parser.add_argument("--t_Q",dest="t_Q",default="k")
+ parser.add_argument("--t_R",dest="t_R",default="k")
+ parser.add_argument("--t_S",dest="t_S",default="k")
+ parser.add_argument("--t_T",dest="t_T",default="k")
+ parser.add_argument("--t_V",dest="t_V",default="k")
+ parser.add_argument("--t_W",dest="t_W",default="k")
+ parser.add_argument("--t_Y",dest="t_Y",default="k")
+
+ args = parser.parse_args()
+
+
+ #print(type(args.output))
+
+ f_colors = [args.f_A,args.f_C,args.f_D,args.f_E,args.f_F,args.f_G,args.f_H,args.f_I,args.f_K,
+ args.f_L,args.f_M,args.f_N,args.f_P,args.f_Q,args.f_R,args.f_S,args.f_T,args.f_V,
+ args.f_W,args.f_Y]
+
+ t_colors = [args.t_A,args.t_C,args.t_D,args.t_E,args.t_F,args.t_G,args.t_H,args.t_I,args.t_K,
+ args.t_L,args.t_M,args.t_N,args.t_P,args.t_Q,args.t_R,args.t_S,args.t_T,args.t_V,
+ args.t_W,args.t_Y]
+
+ colors = [f_colors, t_colors]
+
+ tmp_file = "./tmp.png"
+
+ helical_wheel(sequence=args.sequence,
+ colorcoding=colors[0],
+ text_color=colors[1],
+ seqRange=args.seqRange,
+ t_size=args.t_size,
+ rot=args.rotation,
+ numbering=args.numbering,
+ filename=tmp_file
+ )
+
+ with open("tmp.png", "rb") as f:
+ for line in f:
+ args.output.write(line)
diff -r 000000000000 -r 9caa9aa44fd8 cpt_helical_wheel/generateHelicalWheel.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_helical_wheel/generateHelicalWheel.xml Tue Jul 05 05:21:34 2022 +0000
@@ -0,0 +1,388 @@
+
+
+ Generate and Plot a Protein Helical Wheel
+
+ cpt-macros.xml
+ macros.xml
+
+
+ numpy
+ pandas
+ scikit-learn
+ scipy
+ matplotlib
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Paste in exact sequence to be plotted - Input Sequence of desired helical wheel plot
+> Label Start Number - Numerical value that represents the beginning of the sequence (default 1)
+> Amino Acid Text Size - Size of text for helical wheel (default 32)
+> Rotation - Degrees to rotate helical wheel (defaul 90)
+color parameters :
+> Background Color and Text Color Selections
+METHOD : Using the core features from the modlAMP python module, a helical wheel projection is constructed.
+OUTPUT : _helicalWheel.png
+NOTES : Peptide lengths longer than 36 residues will not properly graph.
+]]>
+
+ 10.1093/bioinformatics/btx285
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
diff -r 000000000000 -r 9caa9aa44fd8 cpt_helical_wheel/macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_helical_wheel/macros.xml Tue Jul 05 05:21:34 2022 +0000
@@ -0,0 +1,56 @@
+
+
+
+
+ python
+ biopython
+ cpt_gffparser
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ "$gff3_data"
+
+
+#if str($reference_genome.reference_genome_source) == 'cached':
+ "${reference_genome.fasta_indexes.fields.path}"
+#else if str($reference_genome.reference_genome_source) == 'history':
+ genomeref.fa
+#end if
+
+
+#if $reference_genome.reference_genome_source == 'history':
+ ln -s $reference_genome.genome_fasta genomeref.fa;
+#end if
+
+
+#if str($reference_genome.reference_genome_source) == 'cached':
+ "${reference_genome.fasta_indexes.fields.path}"
+#else if str($reference_genome.reference_genome_source) == 'history':
+ genomeref.fa
+#end if
+
+
diff -r 000000000000 -r 9caa9aa44fd8 cpt_helical_wheel/plotWheels/__init__.py
diff -r 000000000000 -r 9caa9aa44fd8 cpt_helical_wheel/plotWheels/core.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_helical_wheel/plotWheels/core.py Tue Jul 05 05:21:34 2022 +0000
@@ -0,0 +1,1223 @@
+# -*- coding: utf-8 -*-
+"""
+.. currentmodule:: modlamp.core
+
+.. moduleauthor:: modlab Alex Mueller ETH Zurich
+
+Core helper functions and classes for other modules. The two main classes are:
+
+============================= =======================================================================================
+Class Characteristics
+============================= =======================================================================================
+:py:class:`BaseSequence` Base class inheriting to all sequence classes in the module :py:mod:`modlamp.sequences`
+:py:class:`BaseDescriptor` Base class inheriting to the two descriptor classes in :py:mod:`modlamp.descriptors`
+============================= =======================================================================================
+"""
+
+import os
+import random
+import re
+
+import numpy as np
+import pandas as pd
+import collections
+import operator
+from scipy.spatial import distance
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.utils import shuffle
+
+__author__ = "Alex Müller, Gisela Gabernet"
+__docformat__ = "restructuredtext en"
+
+
+class BaseSequence(object):
+ """Base class for sequence classes in the module :mod:`modlamp.sequences`.
+ It contains amino acid probabilities for different sequence generation classes.
+
+ The following amino acid probabilities are used: (extracted from the
+ `APD3 `_, March 17, 2016)
+
+ === ==== ====== ========= ==========
+ AA rand AMP AMPnoCM randnoCM
+ === ==== ====== ========= ==========
+ A 0.05 0.0766 0.0812275 0.05555555
+ C 0.05 0.071 0.0 0.0
+ D 0.05 0.026 0.0306275 0.05555555
+ E 0.05 0.0264 0.0310275 0.05555555
+ F 0.05 0.0405 0.0451275 0.05555555
+ G 0.05 0.1172 0.1218275 0.05555555
+ H 0.05 0.021 0.0256275 0.05555555
+ I 0.05 0.061 0.0656275 0.05555555
+ K 0.05 0.0958 0.1004275 0.05555555
+ L 0.05 0.0838 0.0884275 0.05555555
+ M 0.05 0.0123 0.0 0.0
+ N 0.05 0.0386 0.0432275 0.05555555
+ P 0.05 0.0463 0.0509275 0.05555555
+ Q 0.05 0.0251 0.0297275 0.05555555
+ R 0.05 0.0545 0.0591275 0.05555555
+ S 0.05 0.0613 0.0659275 0.05555555
+ T 0.05 0.0455 0.0501275 0.05555555
+ V 0.05 0.0572 0.0618275 0.05555555
+ W 0.05 0.0155 0.0201275 0.05555555
+ Y 0.05 0.0244 0.0290275 0.05555555
+ === ==== ====== ========= ==========
+
+ """
+
+ def __init__(self, seqnum, lenmin=7, lenmax=28):
+ """
+ :param seqnum: number of sequences to generate
+ :param lenmin: minimal length of the generated sequences
+ :param lenmax: maximal length of the generated sequences
+ :return: attributes :py:attr:`seqnum`, :py:attr:`lenmin` and :py:attr:`lenmax`.
+ :Example:
+
+ >>> b = BaseSequence(10, 7, 28)
+ >>> b.seqnum
+ 10
+ >>> b.lenmin
+ 7
+ >>> b.lenmax
+ 28
+ """
+ self.sequences = list()
+ self.names = list()
+ self.lenmin = int(lenmin)
+ self.lenmax = int(lenmax)
+ self.seqnum = int(seqnum)
+
+ # AA classes:
+ self.AA_hyd = ['G', 'A', 'L', 'I', 'V']
+ self.AA_basic = ['K', 'R']
+ self.AA_acidic = ['D', 'E']
+ self.AA_aroma = ['W', 'Y', 'F']
+ self.AA_polar = ['S', 'T', 'Q', 'N']
+ # AA labels:
+ self.AAs = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
+ # AA probability from the APD3 database:
+ self.prob_AMP = [0.0766, 0.071, 0.026, 0.0264, 0.0405, 0.1172, 0.021, 0.061, 0.0958, 0.0838, 0.0123, 0.0386,
+ 0.0463, 0.0251, 0.0545, 0.0613, 0.0455, 0.0572, 0.0155, 0.0244]
+ # AA probability from the APD2 database without Cys and Met (synthesis reasons)
+ self.prob_AMPnoCM = [0.081228, 0., 0.030627, 0.031027, 0.045128, 0.121828, 0.025627, 0.065628, 0.100428,
+ 0.088428, 0., 0.043228, 0.050928, 0.029728, 0.059128, 0.065927, 0.050128, 0.061828,
+ 0.020128, 0.029028]
+ # equal AA probabilities:
+ self.prob = [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05,
+ 0.05, 0.05, 0.05, 0.05]
+ # equal AA probabilities but 0 for Cys and Met:
+ self.prob_randnoCM = [0.05555555555, 0.0, 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555,
+ 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555, 0.0, 0.05555555555,
+ 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555, 0.05555555555,
+ 0.05555555555, 0.05555555555]
+
+ # AA probability from the linear CancerPPD peptides:
+ self.prob_ACP = [0.14526966, 0., 0.00690031, 0.00780824, 0.06991102, 0.04957327, 0.01725077, 0.05647358,
+ 0.27637552, 0.17759216, 0.00998729, 0.00798983, 0.01307427, 0.00381333, 0.02941711,
+ 0.02651171, 0.0154349, 0.04013074, 0.0406755, 0.00581079]
+
+ # AA probabilities for perfect amphipathic helix of different arc sizes
+ self.prob_amphihel = [[0.04545455, 0., 0.04545454, 0.04545455, 0., 0.04545455, 0.04545455, 0., 0.25, 0., 0.,
+ 0.04545454, 0.04545455, 0.04545454, 0.25, 0.04545454, 0.04545454, 0., 0., 0.04545454],
+ [0., 0., 0., 0., 0.16666667, 0., 0., 0.16666667, 0., 0.16666667, 0., 0., 0., 0., 0., 0.,
+ 0., 0.16666667, 0.16666667, (1. - 0.16666667 * 5)]]
+
+ # helical ACP AA probabilities, depending on the position of the AA in the helix.
+ self.prob_ACPhel = np.array([[0.0483871, 0., 0., 0.0483871, 0.01612903, 0.12903226, 0.03225807, 0.09677419,
+ 0.19354839, 0.5, 0.0483871, 0.11290323, 0.1, 0.18518519, 0.07843137, 0.12,
+ 0.17073172, 0.16666667],
+ [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.01612903, 0., 0., 0., 0., 0.,
+ 0.02439024,
+ 0.19444444],
+ [0., 0.01612903, 0., 0.27419355, 0.01612903, 0., 0., 0.01612903, 0., 0., 0., 0.,
+ 0.,
+ 0., 0., 0., 0., 0.],
+ [0., 0., 0., 0., 0., 0., 0., 0.06451613, 0., 0.01612903, 0.0483871, 0.01612903, 0.,
+ 0.01851852, 0., 0., 0., 0.],
+ [0.16129032, 0.0483871, 0.30645161, 0., 0.0483871, 0., 0., 0.01612903, 0.,
+ 0.01612903,
+ 0., 0.09677419, 0.06666667, 0.01851852, 0., 0.02, 0.14634146, 0.],
+ [0.64516129, 0., 0.17741936, 0.14516129, 0., 0.01612903, 0.25806452, 0.11290323,
+ 0.06451613, 0.08064516, 0.22580645, 0.03225807, 0.06666667, 0.2037037, 0.1372549,
+ 0.1, 0., 0.05555556],
+ [0., 0., 0., 0.01612903, 0., 0., 0.01612903, 0., 0.03225807, 0., 0., 0.20967742,
+ 0.,
+ 0., 0., 0.16, 0., 0.],
+ [0.0483871, 0.11290323, 0.01612903, 0.08064516, 0.33870968, 0.27419355, 0.,
+ 0.0483871, 0.14516129, 0.06451613, 0.03225807, 0.06451613, 0.18333333, 0., 0.,
+ 0.1, 0.26829268, 0.],
+ [0., 0.03225807, 0.01612903, 0.12903226, 0.12903226, 0., 0.38709677, 0.33870968,
+ 0.0483871, 0.03225807, 0.41935484, 0.08064516, 0., 0.03703704, 0.29411765,
+ 0.04, 0.02439024, 0.02777778],
+ [0.0483871, 0.70967742, 0.12903226, 0.0483871, 0.09677419, 0.32258064, 0.20967742,
+ 0.06451613, 0.11290323, 0.06451613, 0.03225807, 0.03225807, 0.28333333,
+ 0.24074074,
+ 0.03921569, 0.28, 0.07317073, 0.22222222],
+ [0., 0.01612903, 0.01612903, 0.0483871, 0.01612903, 0.03225807, 0., 0., 0., 0.,
+ 0., 0., 0.03333333, 0., 0.01960784, 0.02, 0., 0.],
+ [0., 0.01612903, 0., 0., 0., 0., 0., 0., 0.01612903, 0., 0.03225807, 0., 0., 0.,
+ 0.01960784, 0.02, 0., 0.],
+ [0., 0., 0.14516129, 0.01612903, 0.03225807, 0.01612903, 0., 0., 0., 0.,
+ 0.01612903, 0., 0., 0.12962963, 0.17647059, 0., 0., 0.],
+ [0., 0., 0.01612903, 0.01612903, 0., 0., 0.01612903, 0., 0.01612903, 0., 0.,
+ 0.01612903, 0., 0.01851852, 0., 0., 0., 0.],
+ [0., 0.01612903, 0.01612903, 0., 0.01612903, 0., 0.01612903, 0., 0.01612903,
+ 0.01612903, 0.01612903, 0.01612903, 0., 0.01851852, 0.01960784, 0., 0.04878049,
+ 0.],
+ [0.01612903, 0., 0.01612903, 0.12903226, 0.03225807, 0.03225807, 0.0483871,
+ 0.17741936, 0., 0.03225807, 0.09677419, 0.0483871, 0.01666667, 0., 0.15686274,
+ 0.1, 0., 0.05555556],
+ [0.01612903, 0.01612903, 0., 0.01612903, 0.0483871, 0.01612903, 0., 0.01612903, 0.,
+ 0.01612903, 0.01612903, 0.11290323, 0., 0.01851852, 0.03921569, 0.02, 0.,
+ 0.05555556],
+ [0.01612903, 0.01612903, 0.01612903, 0.01612903, 0.20967742, 0.16129032,
+ 0.01612903,
+ 0.0483871, 0.33870968, 0.16129032, 0., 0.14516129, 0.25, 0.11111111, 0.01960784,
+ 0.02, 0.21951219, 0.22222222],
+ [0., 0., 0.12903226, 0.01612903, 0., 0., 0., 0., 0.01612903, 0., 0., 0., 0., 0.,
+ 0.,
+ 0., 0.02439024, 0.],
+ [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.01612903, 0., 0., 0., 0., 0., 0.]])
+
+ def save_fasta(self, filename, names=False):
+ """Method to save generated sequences in a ``.FASTA`` formatted file.
+
+ :param filename: output filename in which the sequences from :py:attr:`sequences` are safed in fasta format.
+ :param names: {bool} whether sequence names from :py:attr:`names` should be saved as sequence identifiers
+ :return: a FASTA formatted file containing the generated sequences
+ :Example:
+
+ >>> b = BaseSequence(2)
+ >>> b.sequences = ['KLLSLSLALDLLS', 'KLPERTVVNSSDF']
+ >>> b.names = ['Sequence1', 'Sequence2']
+ >>> b.save_fasta('/location/of/fasta/file.fasta', names=True)
+ """
+ if names:
+ save_fasta(filename, self.sequences, self.names)
+ else:
+ save_fasta(filename, self.sequences)
+
+ def mutate_AA(self, nr, prob):
+ """Method to mutate with **prob** probability a **nr** of positions per sequence randomly.
+
+ :param nr: number of mutations to perform per sequence
+ :param prob: probability of mutating a sequence
+ :return: mutated sequences in the attribute :py:attr:`sequences`.
+ :Example:
+
+ >>> b = BaseSequence(1)
+ >>> b.sequences = ['IAKAGRAIIK']
+ >>> b.mutate_AA(3, 1.)
+ >>> b.sequences
+ ['NAKAGRAWIK']
+ """
+ for s in range(len(self.sequences)):
+ # mutate: yes or no? prob = mutation probability
+ mutate = np.random.choice([1, 0], 1, p=[prob, 1 - float(prob)])
+ if mutate == 1:
+ seq = list(self.sequences[s])
+ cnt = 0
+ while cnt < nr: # mutate "nr" AA
+ seq[random.choice(range(len(seq)))] = random.choice(self.AAs)
+ cnt += 1
+ self.sequences[s] = ''.join(seq)
+
+ def filter_duplicates(self):
+ """Method to filter duplicates in the sequences from the class attribute :py:attr:`sequences`
+
+ :return: filtered sequences list in the attribute :py:attr:`sequences` and corresponding names.
+ :Example:
+
+ >>> b = BaseSequence(4)
+ >>> b.sequences = ['KLLKLLKKLLKLLK', 'KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK', 'KLAKLAKKLAKLAK']
+ >>> b.filter_duplicates()
+ >>> b.sequences
+ ['KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK']
+
+ .. versionadded:: v2.2.5
+ """
+ if not self.names:
+ self.names = ['Seq_' + str(i) for i in range(len(self.sequences))]
+ df = pd.DataFrame(list(zip(self.sequences, self.names)), columns=['Sequences', 'Names'])
+ df = df.drop_duplicates('Sequences', 'first') # keep first occurrence of duplicate
+ self.sequences = df['Sequences'].get_values().tolist()
+ self.names = df['Names'].get_values().tolist()
+
+ def keep_natural_aa(self):
+ """Method to filter out sequences that do not contain natural amino acids. If the sequence contains a character
+ that is not in ``['A','C','D,'E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']``.
+
+ :return: filtered sequence list in the attribute :py:attr:`sequences`. The other attributes are also filtered
+ accordingly (if present).
+ :Example:
+
+ >>> b = BaseSequence(2)
+ >>> b.sequences = ['BBBsdflUasUJfBJ', 'GLFDIVKKVVGALGSL']
+ >>> b.keep_natural_aa()
+ >>> b.sequences
+ ['GLFDIVKKVVGALGSL']
+ """
+ natural_aa = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W',
+ 'Y']
+
+ seqs = []
+ names = []
+
+ for i, s in enumerate(self.sequences):
+ seq = list(s.upper())
+ if all(c in natural_aa for c in seq):
+ seqs.append(s.upper())
+ if hasattr(self, 'names') and self.names:
+ names.append(self.names[i])
+
+ self.sequences = seqs
+ self.names = names
+
+ def filter_aa(self, amino_acids):
+ """Method to filter out corresponding names and descriptor values of sequences with given amino acids in the
+ argument list *aminoacids*.
+
+ :param amino_acids: {list} amino acids to be filtered
+ :return: filtered list of sequences names in the corresponding attributes.
+ :Example:
+
+ >>> b = BaseSequence(3)
+ >>> b.sequences = ['AAALLLIIIKKK', 'CCEERRT', 'LLVVIIFFFQQ']
+ >>> b.filter_aa(['C'])
+ >>> b.sequences
+ ['AAALLLIIIKKK', 'LLVVIIFFFQQ']
+ """
+
+ pattern = re.compile('|'.join(amino_acids))
+ seqs = []
+ names = []
+
+ for i, s in enumerate(self.sequences):
+ if not pattern.search(s):
+ seqs.append(s)
+ if hasattr(self, 'names') and self.names:
+ names.append(self.names[i])
+
+ self.sequences = seqs
+ self.names = names
+
+ def clean(self):
+ """Method to clean / clear / empty the attributes :py:attr:`sequences` and :py:attr:`names`.
+
+ :return: freshly initialized, empty class attributes.
+ """
+ self.__init__(self.seqnum, self.lenmin, self.lenmax)
+
+
+class BaseDescriptor(object):
+ """
+ Base class inheriting to both peptide descriptor classes :py:class:`modlamp.descriptors.GlobalDescriptor` and
+ :py:class:`modlamp.descriptors.PeptideDescriptor`.
+ """
+
+ def __init__(self, seqs):
+ """
+ :param seqs: a ``.FASTA`` file with sequences, a list / array of sequences or a single sequence as string to
+ calculate the descriptor values for.
+ :return: initialized attributes :py:attr:`sequences` and :py:attr:`names`.
+ :Example:
+
+ >>> AMP = BaseDescriptor('KLLKLLKKLLKLLK','pepCATS')
+ >>> AMP.sequences
+ ['KLLKLLKKLLKLLK']
+ >>> seqs = BaseDescriptor('/Path/to/file.fasta', 'eisenberg') # load sequences from .fasta file
+ >>> seqs.sequences
+ ['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR'...]
+ """
+ if type(seqs) == list and seqs[0].isupper():
+ self.sequences = [s.strip() for s in seqs]
+ self.names = []
+ elif type(seqs) == np.ndarray and seqs[0].isupper():
+ self.sequences = [s.strip() for s in seqs.tolist()]
+ self.names = []
+ elif type(seqs) == str and seqs.isupper():
+ self.sequences = [seqs.strip()]
+ self.names = []
+ elif os.path.isfile(seqs):
+ if seqs.endswith('.fasta'): # read .fasta file
+ self.sequences, self.names = read_fasta(seqs)
+ elif seqs.endswith('.csv'): # read .csv file with sequences every line
+ with open(seqs) as f:
+ self.sequences = list()
+ cntr = 0
+ self.names = []
+ for line in f:
+ if line.isupper():
+ self.sequences.append(line.strip())
+ self.names.append('seq_' + str(cntr))
+ cntr += 1
+ else:
+ print("Sorry, currently only .fasta or .csv files can be read!")
+ else:
+ print("%s does not exist, is not a valid list of AA sequences or is not a valid sequence string" % seqs)
+
+ self.descriptor = np.array([[]])
+ self.target = np.array([], dtype='int')
+ self.scaler = None
+ self.featurenames = []
+
+ def read_fasta(self, filename):
+ """Method for loading sequences from a ``.FASTA`` formatted file into the attributes :py:attr:`sequences` and
+ :py:attr:`names`.
+
+ :param filename: {str} ``.FASTA`` file with sequences and headers to read
+ :return: {list} sequences in the attribute :py:attr:`sequences` with corresponding sequence names in
+ :py:attr:`names`.
+ """
+ self.sequences, self.names = read_fasta(filename)
+
+ def save_fasta(self, filename, names=False):
+ """Method for saving sequences from :py:attr:`sequences` to a ``.FASTA`` formatted file.
+
+ :param filename: {str} filename of the output ``.FASTA`` file
+ :param names: {bool} whether sequence names from self.names should be saved as sequence identifiers
+ :return: a FASTA formatted file containing the generated sequences
+ """
+ if names:
+ save_fasta(filename, self.sequences, self.names)
+ else:
+ save_fasta(filename, self.sequences)
+
+ def count_aa(self, scale='relative', average=False, append=False):
+ """Method for producing the amino acid distribution for the given sequences as a descriptor
+
+ :param scale: {'absolute' or 'relative'} defines whether counts or frequencies are given for each AA
+ :param average: {boolean} whether the averaged amino acid counts for all sequences should be returned
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: the amino acid distributions for every sequence individually in the attribute :py:attr:`descriptor`
+ :Example:
+
+ >>> AMP = PeptideDescriptor('ACDEFGHIKLMNPQRSTVWY') # aa_count() does not depend on the descriptor scale
+ >>> AMP.count_aa()
+ >>> AMP.descriptor
+ array([[ 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, ... ]])
+ >>> AMP.descriptor.shape
+ (1, 20)
+
+ .. seealso:: :py:func:`modlamp.core.count_aa()`
+ """
+ desc = list()
+ for seq in self.sequences:
+ od = count_aas(seq, scale)
+ desc.append(list(od.values()))
+
+ desc = np.array(desc)
+ self.featurenames = list(od.keys())
+
+ if append:
+ self.descriptor = np.hstack((self.descriptor, desc))
+ elif average:
+ self.descriptor = np.mean(desc, axis=0)
+ else:
+ self.descriptor = desc
+
+ def count_ngrams(self, n):
+ """Method for producing n-grams of all sequences in self.sequences
+
+ :param n: {int or list of ints} defines whether counts or frequencies are given for each AA
+ :return: {dict} dictionary with n-grams as keys and their counts in the sequence as values in :py:attr:`descriptor`
+ :Example:
+
+ >>> D = PeptideDescriptor('GLLDFLSLAALSLDKLVKKGALS')
+ >>> D.count_ngrams([2, 3])
+ >>> D.descriptor
+ {'LS': 3, 'LD': 2, 'LSL': 2, 'AL': 2, ..., 'LVK': 1}
+
+ .. seealso:: :py:func:`modlamp.core.count_ngrams()`
+ """
+ ngrams = dict()
+ for seq in self.sequences:
+ d = count_ngrams(seq, n)
+ for k, v in d.items():
+ if k in ngrams.keys():
+ ngrams[k] += v
+ else:
+ ngrams[k] = v
+ self.descriptor = ngrams
+
+ def feature_scaling(self, stype='standard', fit=True):
+ """Method for feature scaling of the calculated descriptor matrix.
+
+ :param stype: {'standard' or 'minmax'} type of scaling to be used
+ :param fit: {boolean} defines whether the used scaler is first fitting on the data (True) or
+ whether the already fitted scaler in :py:attr:`scaler` should be used to transform (False).
+ :return: scaled descriptor values in :py:attr:`descriptor`
+ :Example:
+
+ >>> D.descriptor
+ array([[0.155],[0.34],[0.16235294],[-0.08842105],[0.116]])
+ >>> D.feature_scaling(type='minmax',fit=True)
+ array([[0.56818182],[1.],[0.5853447],[0.],[0.47714988]])
+ """
+ if stype in ['standard', 'minmax']:
+ if stype == 'standard':
+ self.scaler = StandardScaler()
+ elif stype == 'minmax':
+ self.scaler = MinMaxScaler()
+
+ if fit:
+ self.descriptor = self.scaler.fit_transform(self.descriptor)
+ else:
+ self.descriptor = self.scaler.transform(self.descriptor)
+ else:
+ print("Unknown scaler type!\nAvailable: 'standard', 'minmax'")
+
+ def feature_shuffle(self):
+ """Method for shuffling feature columns randomly.
+
+ :return: descriptor matrix with shuffled feature columns in :py:attr:`descriptor`
+ :Example:
+
+ >>> D.descriptor
+ array([[0.80685625,167.05234375,39.56818125,-0.26338667,155.16888667,33.48778]])
+ >>> D.feature_shuffle()
+ array([[155.16888667,-0.26338667,167.05234375,0.80685625,39.56818125,33.48778]])
+ """
+ self.descriptor = shuffle(self.descriptor.transpose()).transpose()
+
+ def sequence_order_shuffle(self):
+ """Method for shuffling sequence order in the attribute :py:attr:`sequences`.
+
+ :return: sequences in :py:attr:`sequences` with shuffled order in the list.
+ :Example:
+
+ >>> D.sequences
+ ['LILRALKGAARALKVA','VKIAKIALKIIKGLG','VGVRLIKGIGRVARGAI','LRGLRGVIRGGKAIVRVGK','GGKLVRLIARIGKGV']
+ >>> D.sequence_order_shuffle()
+ >>> D.sequences
+ ['VGVRLIKGIGRVARGAI','LILRALKGAARALKVA','LRGLRGVIRGGKAIVRVGK','GGKLVRLIARIGKGV','VKIAKIALKIIKGLG']
+ """
+ self.sequences = shuffle(self.sequences)
+
+ def random_selection(self, num):
+ """Method to randomly select a specified number of sequences (with names and descriptors if present) out of a given
+ descriptor instance.
+
+ :param num: {int} number of entries to be randomly selected
+ :return: updated instance
+ :Example:
+
+ >>> h = Helices(7, 28, 100)
+ >>> h.generate_helices()
+ >>> desc = PeptideDescriptor(h.sequences, 'eisenberg')
+ >>> desc.calculate_moment()
+ >>> len(desc.sequences)
+ 100
+ >>> len(desc.descriptor)
+ 100
+ >>> desc.random_selection(10)
+ >>> len(desc.descriptor)
+ 10
+ >>> len(desc.descriptor)
+ 10
+
+ .. versionadded:: v2.2.3
+ """
+
+ sel = np.random.choice(len(self.sequences), size=num, replace=False)
+ self.sequences = np.array(self.sequences)[sel].tolist()
+ if hasattr(self, 'descriptor') and self.descriptor.size:
+ self.descriptor = self.descriptor[sel]
+ if hasattr(self, 'names') and self.names:
+ self.names = np.array(self.names)[sel].tolist()
+ if hasattr(self, 'target') and self.target.size:
+ self.target = self.target[sel]
+
+ def minmax_selection(self, iterations, distmetric='euclidean', seed=0):
+ """Method to select a specified number of sequences according to the minmax algorithm.
+
+ :param iterations: {int} Number of sequences to retrieve.
+ :param distmetric: Distance metric to calculate the distances between the sequences in descriptor space.
+ Choose from 'euclidean' or 'minkowsky'.
+ :param seed: {int} Set a random seed for numpy to pick the first sequence.
+ :return: updated instance
+
+ .. seealso:: **SciPy** http://docs.scipy.org/doc/scipy/reference/spatial.distance.html
+ """
+
+ # Storing M into pool, where selections get deleted
+ pool = self.descriptor # Store pool where selections get deleted
+ minmaxidx = list() # Store original indices of selections to return
+
+ # Randomly selecting first peptide into the sele
+ np.random.seed(seed)
+ idx = int(np.random.random_integers(0, len(pool), 1))
+ sele = pool[idx:idx + 1, :]
+ minmaxidx.append(int(*np.where(np.all(self.descriptor == pool[idx:idx + 1, :], axis=1))))
+
+ # Deleting peptide in selection from pool
+ pool = np.delete(pool, idx, axis=0)
+
+ for i in range(iterations - 1):
+ # Calculating distance from sele to the rest of the peptides
+ dist = distance.cdist(pool, sele, distmetric)
+
+ # Choosing maximal distances for every sele instance
+ maxidx = np.argmax(dist, axis=0)
+ maxcols = np.max(dist, axis=0)
+
+ # Choosing minimal distance among the maximal distances
+ minmax = np.argmin(maxcols)
+ maxidx = int(maxidx[minmax])
+
+ # Adding it to selection and removing from pool
+ sele = np.append(sele, pool[maxidx:maxidx + 1, :], axis=0)
+ pool = np.delete(pool, maxidx, axis=0)
+ minmaxidx.append(int(*np.where(np.all(self.descriptor == pool[maxidx:maxidx + 1, :], axis=1))))
+
+ self.sequences = np.array(self.sequences)[minmaxidx].tolist()
+ if hasattr(self, 'descriptor') and self.descriptor.size:
+ self.descriptor = self.descriptor[minmaxidx]
+ if hasattr(self, 'names') and self.names:
+ self.names = np.array(self.names)[minmaxidx].tolist()
+ if hasattr(self, 'target') and self.target.size:
+ self.target = self.descriptor[minmaxidx]
+
+ def filter_sequences(self, sequences):
+ """Method to filter out entries for given sequences in *sequences* out of a descriptor instance. All
+ corresponding attribute values of these sequences (e.g. in :py:attr:`descriptor`, :py:attr:`name`) are deleted
+ as well. The method returns an updated descriptor instance.
+
+ :param sequences: {list} sequences to be filtered out of the whole instance, including corresponding data
+ :return: updated instance without filtered sequences
+ :Example:
+
+ >>> sequences = ['KLLKLLKKLLKLLK', 'ACDEFGHIK', 'GLFDIVKKVV', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALGSL']
+ >>> desc = PeptideDescriptor(sequences, 'pepcats')
+ >>> desc.calculate_crosscorr(7)
+ >>> len(desc.descriptor)
+ 5
+ >>> desc.filter_sequences('KLLKLLKKLLKLLK')
+ >>> len(desc.descriptor)
+ 4
+ >>> desc.sequences
+ ['ACDEFGHIK', 'GLFDIVKKVV', 'GLFDIVKKVVGALG', 'GLFDIVKKVVGALGSL']
+ """
+ indices = list()
+ if isinstance(sequences, str): # check if sequences is only one sequence string and convert it to a list
+ sequences = [sequences]
+ for s in sequences: # get indices of queried sequences
+ indices.append(self.sequences.index(s))
+
+ self.sequences = np.delete(np.array(self.sequences), indices, 0).tolist()
+ if hasattr(self, 'descriptor') and self.descriptor.size:
+ self.descriptor = np.delete(self.descriptor, indices, 0)
+ if hasattr(self, 'names') and self.names:
+ self.names = np.delete(np.array(self.names), indices, 0).tolist()
+ if hasattr(self, 'target') and self.target.size:
+ self.target = np.delete(self.target, indices, 0)
+
+ def filter_values(self, values, operator='=='):
+ """Method to filter the descriptor matrix in the attribute :py:attr:`descriptor` for a given list of values (same
+ size as the number of features in the descriptor matrix!) The operator option tells the method whether to
+ filter for values equal, lower, higher ect. to the given values in the *values* array.
+
+ :param values: {list} values to filter the attribute :py:attr:`descriptor` for
+ :param operator: {str} filter criterion, available the operators ``==``, ``<``, ``>``, ``<=``and ``>=``.
+ :return: descriptor matrix and updated sequences containing only entries with descriptor values given in
+ *values* in the corresponding attributes.
+ :Example:
+
+ >>> desc.descriptor # desc = BaseDescriptor instance
+ array([[ 0.7666517 ],
+ [ 0.38373498]])
+ >>> desc.filter_values([0.5], '<')
+ >>> desc.descriptor
+ array([[ 0.38373498]])
+ """
+ dim = self.descriptor.shape[1]
+ for d in range(dim): # for all the features in self.descriptor
+ if operator == '==':
+ indices = np.where(self.descriptor[:, d] == values[d])[0]
+ elif operator == '<':
+ indices = np.where(self.descriptor[:, d] < values[d])[0]
+ elif operator == '>':
+ indices = np.where(self.descriptor[:, d] > values[d])[0]
+ elif operator == '<=':
+ indices = np.where(self.descriptor[:, d] <= values[d])[0]
+ elif operator == '>=':
+ indices = np.where(self.descriptor[:, d] >= values[d])[0]
+ else:
+ raise KeyError('available operators: ``==``, ``<``, ``>``, ``<=``and ``>=``')
+
+ # filter descriptor matrix, sequence list and names list according to obtained indices
+ self.sequences = np.array(self.sequences)[indices].tolist()
+ if hasattr(self, 'descriptor') and self.descriptor.size:
+ self.descriptor = self.descriptor[indices]
+ if hasattr(self, 'names') and self.names:
+ self.names = np.array(self.names)[indices].tolist()
+ if hasattr(self, 'target') and self.target.size:
+ self.target = self.target[indices]
+
+ def filter_aa(self, amino_acids):
+ """Method to filter out corresponding names and descriptor values of sequences with given amino acids in the
+ argument list *aminoacids*.
+
+ :param amino_acids: list of amino acids to be filtered
+ :return: filtered list of sequences, descriptor values, target values and names in the corresponding attributes.
+ :Example:
+
+ >>> b = BaseSequence(3)
+ >>> b.sequences = ['AAALLLIIIKKK', 'CCEERRT', 'LLVVIIFFFQQ']
+ >>> b.filter_aa(['C'])
+ >>> b.sequences
+ ['AAALLLIIIKKK', 'LLVVIIFFFQQ']
+ """
+
+ pattern = re.compile('|'.join(amino_acids))
+ seqs = []
+ desc = []
+ names = []
+ target = []
+
+ for i, s in enumerate(self.sequences):
+ if not pattern.search(s):
+ seqs.append(s)
+ if hasattr(self, 'descriptor') and self.descriptor.size:
+ desc.append(self.descriptor[i])
+ if hasattr(self, 'names') and self.names:
+ names.append(self.names[i])
+ if hasattr(self, 'target') and self.target.size:
+ target.append(self.target[i])
+
+ self.sequences = seqs
+ self.names = names
+ self.descriptor = np.array(desc)
+ self.target = np.array(target, dtype='int')
+
+ def filter_duplicates(self):
+ """Method to filter duplicates in the sequences from the class attribute :py:attr:`sequences`
+
+ :return: filtered sequences list in the attribute :py:attr:`sequences` and corresponding names.
+ :Example:
+
+ >>> b = BaseDescriptor(['KLLKLLKKLLKLLK', 'KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK', 'KLAKLAKKLAKLAK'])
+ >>> b.filter_duplicates()
+ >>> b.sequences
+ ['KLLKLLKKLLKLLK', 'KLAKLAKKLAKLAK']
+
+ .. versionadded:: v2.2.5
+ """
+ if not self.names:
+ self.names = ['Seq_' + str(i) for i in range(len(self.sequences))]
+ if not self.target:
+ self.target = [0] * len(self.sequences)
+ if not self.descriptor:
+ self.descriptor = np.zeros(len(self.sequences))
+ df = pd.DataFrame(np.array([self.sequences, self.names, self.descriptor, self.target]).T,
+ columns=['Sequences', 'Names', 'Descriptor', 'Target'])
+ df = df.drop_duplicates('Sequences', 'first') # keep first occurrence of duplicate
+ self.sequences = df['Sequences'].get_values().tolist()
+ self.names = df['Names'].get_values().tolist()
+ self.descriptor = df['Descriptor'].get_values()
+ self.target = df['Target'].get_values()
+
+ def keep_natural_aa(self):
+ """Method to filter out sequences that do not contain natural amino acids. If the sequence contains a character
+ that is not in ['A','C','D,'E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'].
+
+ :return: filtered sequence list in the attribute :py:attr:`sequences`. The other attributes are also filtered
+ accordingly (if present).
+ :Example:
+
+ >>> b = BaseSequence(2)
+ >>> b.sequences = ['BBBsdflUasUJfBJ', 'GLFDIVKKVVGALGSL']
+ >>> b.keep_natural_aa()
+ >>> b.sequences
+ ['GLFDIVKKVVGALGSL']
+ """
+
+ natural_aa = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W',
+ 'Y']
+
+ seqs = []
+ desc = []
+ names = []
+ target = []
+
+ for i, s in enumerate(self.sequences):
+ seq = list(s.upper())
+ if all(c in natural_aa for c in seq):
+ seqs.append(s.upper())
+ if hasattr(self, 'descriptor') and self.descriptor.size:
+ desc.append(self.descriptor[i])
+ if hasattr(self, 'names') and self.names:
+ names.append(self.names[i])
+ if hasattr(self, 'target') and self.target.size:
+ target.append(self.target[i])
+
+ self.sequences = seqs
+ self.names = names
+ self.descriptor = np.array(desc)
+ self.target = np.array(target, dtype='int')
+
+ def load_descriptordata(self, filename, delimiter=",", targets=False, skip_header=0):
+ """Method to load any data file with sequences and descriptor values and save it to a new insatnce of the
+ class :class:`modlamp.descriptors.PeptideDescriptor`.
+
+ .. note:: Headers are not considered. To skip initial lines in the file, use the *skip_header* option.
+
+ :param filename: {str} filename of the data file to be loaded
+ :param delimiter: {str} column delimiter
+ :param targets: {boolean} whether last column in the file contains a target class vector
+ :param skip_header: {int} number of initial lines to skip in the file
+ :return: loaded sequences, descriptor values and targets in the corresponding attributes.
+ """
+ data = np.genfromtxt(filename, delimiter=delimiter, skip_header=skip_header)
+ data = data[:, 1:] # skip sequences as they are "nan" when read as float
+ seqs = np.genfromtxt(filename, delimiter=delimiter, dtype="str")
+ seqs = seqs[:, 0]
+ if targets:
+ self.target = np.array(data[:, -1], dtype='int')
+ self.sequences = seqs
+ self.descriptor = data
+
+ def save_descriptor(self, filename, delimiter=',', targets=None, header=None):
+ """Method to save the descriptor values to a .csv/.txt file
+
+ :param filename: filename of the output file
+ :param delimiter: column delimiter
+ :param targets: target class vector to be added to descriptor (same length as :py:attr:`sequences`)
+ :param header: {str} header to be written at the beginning of the file (if ``None``: feature names are taken)
+ :return: output file with peptide names and descriptor values
+ """
+ seqs = np.array(self.sequences, dtype='|S80')[:, np.newaxis]
+ ids = np.array(self.names, dtype='|S80')[:, np.newaxis]
+ if ids.shape == seqs.shape:
+ names = np.hstack((ids, seqs))
+ else:
+ names = seqs
+ if targets and len(targets) == len(self.sequences):
+ target = np.array(targets)[:, np.newaxis]
+ data = np.hstack((names, self.descriptor, target))
+ else:
+ data = np.hstack((names, self.descriptor))
+ if not header:
+ featurenames = [['Sequence']] + self.featurenames
+ header = ', '.join([f[0] for f in featurenames])
+ np.savetxt(filename, data, delimiter=delimiter, fmt='%s', header=header)
+
+
+def load_scale(scalename):
+ """Method to load scale values for a given amino acid scale
+
+ :param scalename: amino acid scale name, for available scales see the
+ :class:`modlamp.descriptors.PeptideDescriptor()` documentation.
+ :return: amino acid scale values in dictionary format.
+ """
+ # predefined amino acid scales dictionary
+ scales = {
+ 'aasi': {'A': [1.89], 'C': [1.73], 'D': [3.13], 'E': [3.14], 'F': [1.53], 'G': [2.67], 'H': [3], 'I': [1.97],
+ 'K': [2.28], 'L': [1.74], 'M': [2.5], 'N': [2.33], 'P': [0.22], 'Q': [3.05], 'R': [1.91], 'S': [2.14],
+ 'T': [2.18], 'V': [2.37], 'W': [2], 'Y': [2.01]},
+ 'abhprk': {'A': [0, 0, 0, 0, 0, 0], 'C': [0, 0, 0, 0, 0, 0], 'D': [1, 0, 0, 1, 0, 0], 'E': [1, 0, 0, 1, 0, 0],
+ 'F': [0, 0, 1, 0, 1, 0], 'G': [0, 0, 0, 0, 0, 0], 'H': [0, 0, 0, 1, 1, 0], 'I': [0, 0, 1, 0, 0, 0],
+ 'K': [0, 1, 0, 1, 0, 0], 'L': [0, 0, 1, 0, 0, 0], 'M': [0, 0, 1, 0, 0, 0], 'N': [0, 0, 0, 1, 0, 0],
+ 'P': [0, 0, 0, 0, 0, 1], 'Q': [0, 0, 0, 1, 0, 0], 'R': [0, 1, 0, 1, 0, 0], 'S': [0, 0, 0, 1, 0, 0],
+ 'T': [0, 0, 0, 1, 0, 0], 'V': [0, 0, 1, 0, 0, 0], 'W': [0, 0, 1, 0, 1, 0], 'Y': [0, 0, 0, 1, 1, 0]},
+ 'argos': {'I': [0.77], 'F': [1.2], 'V': [0.14], 'L': [2.3], 'W': [0.07], 'M': [2.3], 'A': [0.64], 'G': [-0.48],
+ 'C': [0.25], 'Y': [-0.41], 'P': [-0.31], 'T': [-0.13], 'S': [-0.25], 'H': [-0.87], 'E': [-0.94],
+ 'N': [-0.89], 'Q': [-0.61], 'D': [-1], 'K': [-1], 'R': [-0.68]},
+ 'bulkiness': {'A': [0.443], 'C': [0.551], 'D': [0.453], 'E': [0.557], 'F': [0.898], 'G': [0], 'H': [0.563],
+ 'I': [0.985], 'K': [0.674], 'L': [0.985], 'M': [0.703], 'N': [0.516], 'P': [0.768], 'Q': [0.605],
+ 'R': [0.596], 'S': [0.332], 'T': [0.677], 'V': [0.995], 'W': [1], 'Y': [0.801]},
+ 'charge_phys': {'A': [0.], 'C': [-.1], 'D': [-1.], 'E': [-1.], 'F': [0.], 'G': [0.], 'H': [0.1],
+ 'I': [0.], 'K': [1.], 'L': [0.], 'M': [0.], 'N': [0.], 'P': [0.], 'Q': [0.],
+ 'R': [1.], 'S': [0.], 'T': [0.], 'V': [0.], 'W': [0.], 'Y': [0.]},
+ 'charge_acid': {'A': [0.], 'C': [-.1], 'D': [-1.], 'E': [-1.], 'F': [0.], 'G': [0.], 'H': [1.],
+ 'I': [0.], 'K': [1.], 'L': [0.], 'M': [0.], 'N': [0.], 'P': [0.], 'Q': [0.],
+ 'R': [1.], 'S': [0.], 'T': [0.], 'V': [0.], 'W': [0.], 'Y': [0.]},
+ 'cougar': {'A': [0.25, 0.62, 1.89], 'C': [0.208, 0.29, 1.73], 'D': [0.875, -0.9, 3.13],
+ 'E': [0.833, -0.74, 3.14], 'F': [0.042, 1.2, 1.53], 'G': [1, 0.48, 2.67], 'H': [0.083, -0.4, 3],
+ 'I': [0.667, 1.4, 1.97], 'K': [0.708, -1.5, 2.28], 'L': [0.292, 1.1, 1.74], 'M': [0, 0.64, 2.5],
+ 'N': [0.667, -0.78, 2.33], 'P': [0.875, 0.12, 0.22], 'Q': [0.792, -0.85, 3.05],
+ 'R': [0.958, -2.5, 1.91], 'S': [0.875, -0.18, 2.14], 'T': [0.583, -0.05, 2.18],
+ 'V': [0.375, 1.1, 2.37], 'W': [0.042, 0.81, 2], 'Y': [0.5, 0.26, 2.01]},
+ 'eisenberg': {'I': [1.4], 'F': [1.2], 'V': [1.1], 'L': [1.1], 'W': [0.81], 'M': [0.64], 'A': [0.62],
+ 'G': [0.48], 'C': [0.29], 'Y': [0.26], 'P': [0.12], 'T': [-0.05], 'S': [-0.18], 'H': [-0.4],
+ 'E': [-0.74], 'N': [-0.78], 'Q': [-0.85], 'D': [-0.9], 'K': [-1.5], 'R': [-2.5]},
+ 'ez': {'A': [-0.29, 10.22, 4.67], 'C': [0.95, 13.69, 5.77], 'D': [1.19, 14.25, 8.98], 'E': [1.3, 14.66, 4.16],
+ 'F': [-0.8, 19.67, 7.12], 'G': [-0.01, 13.86, 6], 'H': [0.75, 12.26, 2.77], 'I': [-0.56, 14.34, 10.69],
+ 'K': [1.66, 11.11, 2.09], 'L': [-0.64, 17.34, 8.61], 'M': [-0.28, 18.04, 7.13], 'N': [0.89, 12.78, 6.28],
+ 'P': [0.83, 18.09, 3.53], 'Q': [1.21, 10.46, 2.59], 'R': [1.55, 9.34, 4.68], 'S': [0.1, 13.86, 6],
+ 'T': [0.01, 13.86, 6], 'V': [-0.47, 11.35, 4.97], 'W': [-0.85, 11.65, 7.2], 'Y': [-0.42, 13.04, 6.2]},
+ 'flexibility': {'A': [0.25], 'C': [0.208], 'D': [0.875], 'E': [0.833], 'F': [0.042], 'G': [1], 'H': [0.083],
+ 'I': [0.667], 'K': [0.708], 'L': [0.292], 'M': [0.], 'N': [0.667], 'P': [0.875], 'Q': [0.792],
+ 'R': [0.958], 'S': [0.875], 'T': [0.583], 'V': [0.375], 'W': [0.042], 'Y': [0.5]},
+ 'grantham': {'A': [0, 8.1, 31], 'C': [2.75, 5.5, 55], 'D': [1.38, 13.0, 54], 'E': [0.92, 12.3, 83],
+ 'F': [0, 5.2, 132], 'G': [0.74, 9.0, 3], 'H': [0.58, 10.4, 96], 'I': [0, 5.2, 111],
+ 'K': [0.33, 11.3, 119], 'L': [0, 4.9, 111], 'M': [0, 5.7, 105], 'N': [1.33, 11.6, 56],
+ 'P': [0.39, 8.0, 32.5], 'Q': [0.89, 10.5, 85], 'R': [0.65, 10.5, 124], 'S': [1.42, 9.2, 32],
+ 'T': [0.71, 8.6, 61], 'V': [0, 5.9, 84], 'W': [0.13, 5.4, 170], 'Y': [0.20, 6.2, 136]},
+ 'gravy': {'I': [4.5], 'V': [4.2], 'L': [3.8], 'F': [2.8], 'C': [2.5], 'M': [1.9], 'A': [1.8], 'G': [-0.4],
+ 'T': [-0.7], 'W': [-0.9], 'S': [-0.8], 'Y': [-1.3], 'P': [-1.6], 'H': [-3.2], 'E': [-3.5],
+ 'Q': [-3.5], 'D': [-3.5], 'N': [-3.5], 'K': [-3.9], 'R': [-4.5]},
+ 'hopp-woods': {'A': [-0.5], 'C': [-1], 'D': [3], 'E': [3], 'F': [-2.5], 'G': [0], 'H': [-0.5], 'I': [-1.8],
+ 'K': [3], 'L': [-1.8], 'M': [-1.3], 'N': [0.2], 'P': [0], 'Q': [0.2], 'R': [3], 'S': [0.3],
+ 'T': [-0.4], 'V': [-1.5], 'W': [-3.4], 'Y': [-2.3]},
+ 'isaeci': {'A': [62.9, 0.05], 'C': [78.51, 0.15], 'D': [18.46, 1.25], 'E': [30.19, 1.31], 'F': [189.42, 0.14],
+ 'G': [19.93, 0.02], 'H': [87.38, 0.56], 'I': [149.77, 0.09], 'K': [102.78, 0.53], 'L': [154.35, 0.1],
+ 'M': [132.22, 0.34], 'N': [19.53, 1.36], 'P': [122.35, 0.16], 'Q': [17.87, 1.31], 'R': [52.98, 1.69],
+ 'S': [19.75, 0.56], 'T': [59.44, 0.65], 'V': [120.91, 0.07], 'W': [179.16, 1.08],
+ 'Y': [132.16, 0.72]},
+ 'janin': {'I': [1.2], 'F': [0.87], 'V': [1], 'L': [0.87], 'W': [0.59], 'M': [0.73], 'A': [0.59], 'G': [0.59],
+ 'C': [1.4], 'Y': [-0.4], 'P': [-0.26], 'T': [-0.12], 'S': [0.02], 'H': [0.02], 'E': [-0.83],
+ 'N': [-0.55], 'Q': [-0.83], 'D': [-0.69], 'K': [-2.4], 'R': [-1.8]},
+ 'kytedoolittle': {'I': [1.7], 'F': [1.1], 'V': [1.6], 'L': [1.4], 'W': [-0.14], 'M': [0.8], 'A': [0.77],
+ 'G': [0.03], 'C': [1], 'Y': [-0.27], 'P': [-0.37], 'T': [-0.07], 'S': [-0.1], 'H': [-0.91],
+ 'E': [-1], 'N': [-1], 'Q': [-1], 'D': [-1], 'K': [-1.1], 'R': [-1.3]},
+ 'levitt_alpha': {'A': [1.29], 'C': [1.11], 'D': [1.04], 'E': [1.44], 'F': [1.07], 'G': [0.56], 'H': [1.22],
+ 'I': [0.97], 'K': [1.23], 'L': [1.3], 'M': [1.47], 'N': [0.9], 'P': [0.52], 'Q': [1.27],
+ 'R': [0.96], 'S': [0.82], 'T': [0.82], 'V': [0.91], 'W': [0.99], 'Y': [0.72]},
+ 'mss': {'A': [13.02], 'C': [23.7067], 'D': [22.02], 'E': [20.0233], 'F': [23.5288], 'G': [1.01], 'H': [23.5283],
+ 'I': [22.3611], 'K': [18.9756], 'L': [19.6944], 'M': [21.92], 'N': [21.8567], 'P': [19.0242],
+ 'Q': [19.9689], 'R': [19.0434], 'S': [18.3533], 'T': [22.3567], 'V': [21.0267], 'W': [26.1975],
+ 'Y': [24.1954]},
+ 'msw': {'A': [-0.73, 0.2, -0.62], 'C': [-0.66, 0.26, -0.27], 'D': [0.11, -1, -0.96], 'E': [0.24, -0.39, -0.04],
+ 'F': [0.76, 0.85, -0.34], 'G': [-0.31, -0.28, -0.75], 'H': [0.84, 0.67, -0.78],
+ 'I': [-0.91, 0.83, -0.25], 'K': [-0.51, 0.08, 0.6], 'L': [-0.74, 0.72, -0.16], 'M': [-0.7, 1, -0.32],
+ 'N': [0.14, 0.2, -0.66], 'P': [-0.43, 0.73, -0.6], 'Q': [0.3, 1, -0.3], 'R': [-0.22, 0.27, 1],
+ 'S': [-0.8, 0.61, -1], 'T': [-0.58, 0.85, -0.89], 'V': [-1, 0.79, -0.58], 'W': [1, 0.98, -0.47],
+ 'Y': [0.97, 0.66, -0.16]},
+ 'pepcats': {'A': [1, 0, 0, 0, 0, 0], 'C': [1, 0, 1, 1, 0, 0], 'D': [0, 0, 1, 0, 0, 1], 'E': [0, 0, 1, 0, 0, 1],
+ 'F': [1, 1, 0, 0, 0, 0], 'G': [0, 0, 0, 0, 0, 0], 'H': [1, 1, 0, 1, 1, 0], 'I': [1, 0, 0, 0, 0, 0],
+ 'K': [1, 0, 0, 1, 1, 0], 'L': [1, 0, 0, 0, 0, 0], 'M': [1, 0, 1, 0, 0, 0], 'N': [0, 0, 1, 1, 0, 0],
+ 'P': [1, 0, 0, 0, 0, 0], 'Q': [0, 0, 1, 1, 0, 0], 'R': [1, 0, 0, 1, 1, 0], 'S': [0, 0, 1, 1, 0, 0],
+ 'T': [0, 0, 1, 1, 0, 0], 'V': [1, 0, 0, 0, 0, 0], 'W': [1, 1, 0, 1, 0, 0], 'Y': [1, 1, 1, 1, 0, 0]},
+ 'peparc': {'A': [1, 0, 0, 0, 0], 'C': [0, 1, 0, 0, 0], 'D': [0, 1, 0, 1, 0], 'E': [0, 1, 0, 1, 0],
+ 'F': [1, 0, 0, 0, 0], 'G': [0, 0, 0, 0, 0], 'H': [0, 1, 1, 0, 0], 'I': [1, 0, 0, 0, 0],
+ 'K': [0, 1, 1, 0, 0], 'L': [1, 0, 0, 0, 0], 'M': [1, 0, 0, 0, 0], 'N': [0, 1, 0, 0, 0],
+ 'P': [0, 0, 0, 0, 1], 'Q': [0, 1, 0, 0, 0], 'R': [0, 1, 1, 0, 0], 'S': [0, 1, 0, 0, 0],
+ 'T': [0, 1, 0, 0, 0], 'V': [1, 0, 0, 0, 0], 'W': [1, 0, 0, 0, 0], 'Y': [1, 0, 0, 0, 0]},
+ 'polarity': {'A': [0.395], 'C': [0.074], 'D': [1.], 'E': [0.914], 'F': [0.037], 'G': [0.506], 'H': [0.679],
+ 'I': [0.037], 'K': [0.79], 'L': [0.], 'M': [0.099], 'N': [0.827], 'P': [0.383], 'Q': [0.691],
+ 'R': [0.691], 'S': [0.531], 'T': [0.457], 'V': [0.123], 'W': [0.062], 'Y': [0.16]},
+ 'ppcali': {
+ 'A': [0.070781, 0.036271, 2.042, 0.083272, 0.69089, 0.15948, -0.80893, 0.24698, 0.86525, 0.68563, -0.24665,
+ 0.61314, -0.53343, -0.50878, -1.3646, 2.2679, -1.5644, -0.75043, -0.65875],
+ 'C': [0.61013, -0.93043, -0.85983, -2.2704, 1.5877, -2.0066, -0.30314, 1.2544, -0.2832, -1.2844, -0.73449,
+ -0.11235, -0.41152, -0.0050164, 0.28307, 0.20522, -0.021084, -0.15627, -0.32689],
+ 'D': [-1.3215, 0.24063, -0.032754, -0.37863, 1.2051, 1.0001, 2.1827, 0.19212, -0.60529, 0.37639, -0.46451,
+ -0.46788, 1.4077, -2.1661, 0.72604, -0.12332, -0.8243, -0.082989, 0.053476],
+ 'E': [-0.87713, 1.4905, 1.0755, 0.35944, 1.567, 0.41365, 1.0944, 0.72634, -0.74957, 0.038939, 0.075057,
+ 0.78637, -1.4543, 1.6667, -0.097439, -0.24293, 1.7687, 0.36174, -0.11585],
+ 'F': [1.3557, -0.10336, -0.4309, 0.41269, -0.083356, 0.83783, 0.095381, -0.65222, -0.3119, 0.43293, -1.0011,
+ -0.66855, -0.10242, 1.2066, 2.6234, 1.9981, -0.25016, 0.71979, 0.21569],
+ 'G': [-1.0818, -2.1561, 0.77082, -0.92747, -1.0748, 1.7997, -1.3708, 1.279, -1.2098, 0.46065, 0.43076,
+ 0.20037, -0.2302, 0.2646, 0.57149, -0.68432, 0.19341, -0.061606, -0.08071],
+ 'H': [-0.050161, 0.69246, -0.88397, -0.64601, 0.24622, 0.10487, -1.1317, -2.3661, -0.89918, 0.46391,
+ -0.62359, 2.5478, -0.34737, -0.52062, 0.17522, -0.88648, -0.4755, 0.023187, -0.28261],
+ 'I': [1.4829, -0.46435, 0.50189, 0.55724, -0.51535, -0.29914, 0.97236, -0.15793, -0.98246, -0.54347,
+ 0.97806, 0.37577, 1.618, 0.62323, -0.59359, -0.35483, -0.085017, 0.55825, -2.7542],
+ 'K': [-0.85344, 1.529, 0.27747, 0.32993, -1.1786, -0.16633, -1.0459, 0.44621, 0.41027, -2.5318, 0.91329,
+ 0.53385, 0.61417, -1.111, 1.1323, 0.95105, 0.76769, -0.016115, 0.054995],
+ 'L': [1.2857, 0.039488, 1.5378, 0.87969, -0.21419, 0.40389, -0.20426, -0.14351, 0.61024, -1.1927, -2.2149,
+ -0.84248, -0.5061, -0.48548, 0.10791, -2.1503, -0.12006, -0.60222, 0.26546],
+ 'M': [1.137, 0.64388, 0.13724, -0.2988, 1.2288, 0.24981, -1.6427, -0.75868, -0.54902, 1.0571, 1.272,
+ -1.9104, 0.70919, -0.93575, -0.6314, -0.079654, 1.634, -0.0021923, 0.49825],
+ 'N': [-1.084, -0.176, -0.47062, -0.92245, -0.32953, 0.74278, 0.34551, -1.4605, 0.25219, -1.2107, -0.59978,
+ -0.79183, 1.3268, 1.9839, -1.6137, 0.5333, 0.033889, -1.0331, 0.83019],
+ 'P': [-1.1823, -1.6911, -1.1331, 3.073, 1.1942, -0.93426, -0.72985, -0.042441, -0.19264, -0.21603, -0.1239,
+ 0.054016, 0.15241, -0.019691, -0.20543, 0.10206, 0.07671, -0.081968, 0.20348],
+ 'Q': [-0.57747, 0.97452, -0.077547, -0.0033488, 0.17184, -0.52537, -0.27362, -0.1366, 0.2057, -0.013066,
+ 1.8834, -1.2736, -0.84991, 1.0445, 0.69027, -1.2866, -2.6776, 0.1683, 0.086105],
+ 'R': [-0.62245, 1.545, -0.61966, 0.19057, -1.7485, -1.3909, -0.47526, 1.3938, -0.84556, 1.7344, -1.6516,
+ -0.52678, 0.6791, 0.24374, -0.62551, -0.0028271, -0.053884, 0.14926, -0.17232],
+ 'S': [-0.86409, -0.77147, 0.38542, -0.59389, -0.53313, -0.47585, 0.31966, -0.89716, 1.8029, 0.26431,
+ -0.23173, -0.37626, -0.47349, -0.42878, -0.47297, -0.079826, 0.57043, 3.2057, -0.18413],
+ 'T': [-0.33027, -0.57447, 0.18653, -0.28941, -0.62681, -1.0737, 0.80363, -0.59525, 1.8786, 1.3971, 0.63929,
+ 0.21281, -0.067048, 0.096271, 1.323, -0.36173, 1.2261, -2.2771, -0.65412],
+ 'V': [1.1675, -0.61554, 0.95405, 0.11662, -0.74473, -1.1482, 1.1309, 0.12079, -0.77171, 0.18597, 0.93442,
+ 1.201, 0.3826, -0.091573, -0.31269, 0.074367, -0.22946, 0.24322, 2.9836],
+ 'W': [1.1881, 0.43789, -1.7915, 0.138, 0.43088, 1.6467, -0.11987, 1.7369, 2.0818, 0.33122, 0.31829, 1.1586,
+ 0.67649, 0.30819, -0.55772, -0.54491, -0.17969, 0.24477, 0.38674],
+ 'Y': [0.54671, -0.1468, -1.5688, 0.19001, -1.2736, 0.66162, 1.1614, -0.18614, -0.70654, -0.43634, 0.44775,
+ -0.71366, -2.5907, -1.1649, -1.1576, 0.66572, 0.21019, -0.61016, -0.34844]},
+ 'refractivity': {'A': [0.102045615], 'C': [0.841053374], 'D': [0.282153774], 'E': [0.405831178],
+ 'F': [0.691276746], 'G': [0], 'H': [0.512814484], 'I': [0.448154244], 'K': [0.50058782],
+ 'L': [0.441570656], 'M': [0.508817305], 'N': [0.282153774], 'P': [0.256995062],
+ 'Q': [0.405831178], 'R': [0.626851634], 'S': [0.149306372], 'T': [0.258876087],
+ 'V': [0.327298378], 'W': [1], 'Y': [0.741359041]},
+ 't_scale': {'A': [-8.4, -8.01, -3.73, -3.65, -6.12, -1.59, 1.56],
+ 'C': [-2.44, -1.96, 0.93, -2.35, 1.31, 2.29, -1.52],
+ 'D': [-6.84, -0.94, 17.68, -0.03, 3.44, 9.07, 4.32],
+ 'E': [-6.5, 16.2, 17.28, 3.11, -4.75, -2.54, 4.72],
+ 'F': [21.59, -5.73, 1.03, -3.3, 2.64, -5.02, 1.7],
+ 'G': [-8.48, -10.37, -5.14, -6.51, -11.84, -3.6, 2.01],
+ 'H': [15.28, -3.67, 6.72, -6.38, 4.12, -1.55, -2.85],
+ 'I': [-2.97, 4.64, -0.77, 11, 3.26, -4.36, -7.88],
+ 'K': [2.7, 13.46, -14.03, -2.55, 2.77, 0.15, 3.19],
+ 'L': [2.61, 5.96, 1.97, 2.59, -4.77, -4.84, -5.44],
+ 'M': [3.38, 12.43, -4.77, 0.45, -1.55, -0.6, 3.26],
+ 'N': [-3.11, -1.22, 6.26, -9.38, 9.94, 7.66, -4.81],
+ 'P': [-5.35, -9.07, -1.52, -8.79, -8.73, 4.29, -9.91],
+ 'Q': [-5.31, 15.64, 8.44, 1.03, -4.32, -4.4, -0.52],
+ 'R': [-2.27, 18.9, -18.24, -3.47, 3.03, 6.64, 0.45],
+ 'S': [-15.88, -11.21, -2.44, -3.61, 3.46, -0.37, 8.98],
+ 'T': [-17.81, -13.64, -5.19, 10.57, 6.91, -4.43, 3.49],
+ 'V': [-5.8, -6.15, -2.26, 9.87, 5.28, -1.49, -7.54],
+ 'W': [21.68, -8.78, -2.53, 15.53, -8.15, 11.98, 3.23],
+ 'Y': [23.9, -6.47, 0.31, -4.14, 4.08, -7.28, 3.59]},
+ 'tm_tend': {'A': [0.38], 'C': [-0.3], 'D': [-3.27], 'E': [-2.9], 'F': [1.98], 'G': [-0.19], 'H': [-1.44],
+ 'I': [1.97], 'K': [-3.46], 'L': [1.82], 'M': [1.4], 'N': [-1.62], 'P': [-1.44], 'Q': [-1.84],
+ 'R': [-2.57], 'S': [-0.53], 'T': [-0.32], 'V': [1.46], 'W': [1.53], 'Y': [0.49]},
+ 'z3': {'A': [0.07, -1.73, 0.09], 'C': [0.71, -0.97, 4.13], 'D': [3.64, 1.13, 2.36], 'E': [3.08, 0.39, -0.07],
+ 'F': [-4.92, 1.3, 0.45], 'G': [2.23, -5.36, 0.3], 'H': [2.41, 1.74, 1.11], 'I': [-4.44, -1.68, -1.03],
+ 'K': [2.84, 1.41, -3.14], 'L': [-4.19, -1.03, -0.98], 'M': [-2.49, -0.27, -0.41],
+ 'N': [3.22, 1.45, 0.84], 'P': [-1.22, 0.88, 2.23], 'Q': [2.18, 0.53, -1.14], 'R': [2.88, 2.52, -3.44],
+ 'S': [1.96, -1.63, 0.57], 'T': [0.92, -2.09, -1.4], 'V': [-2.69, -2.53, -1.29], 'W': [-4.75, 3.65, 0.85],
+ 'Y': [-1.39, 2.32, 0.01]},
+ 'z5': {'A': [0.24, -2.32, 0.6, -0.14, 1.3], 'C': [0.84, -1.67, 3.71, 0.18, -2.65],
+ 'D': [3.98, 0.93, 1.93, -2.46, 0.75], 'E': [3.11, 0.26, -0.11, -3.04, -0.25],
+ 'F': [-4.22, 1.94, 1.06, 0.54, -0.62], 'G': [2.05, -4.06, 0.36, -0.82, -0.38],
+ 'H': [2.47, 1.95, 0.26, 3.9, 0.09], 'I': [-3.89, -1.73, -1.71, -0.84, 0.26],
+ 'K': [2.29, 0.89, -2.49, 1.49, 0.31], 'L': [-4.28, -1.3, -1.49, -0.72, 0.84],
+ 'M': [-2.85, -0.22, 0.47, 1.94, -0.98], 'N': [3.05, 1.62, 1.04, -1.15, 1.61],
+ 'P': [-1.66, 0.27, 1.84, 0.7, 2], 'Q': [1.75, 0.5, -1.44, -1.34, 0.66],
+ 'R': [3.52, 2.5, -3.5, 1.99, -0.17], 'S': [2.39, -1.07, 1.15, -1.39, 0.67],
+ 'T': [0.75, -2.18, -1.12, -1.46, -0.4], 'V': [-2.59, -2.64, -1.54, -0.85, -0.02],
+ 'W': [-4.36, 3.94, 0.59, 3.44, -1.59], 'Y': [-2.54, 2.44, 0.43, 0.04, -1.47]}
+ }
+ if scalename == 'all':
+ d = {'I': [], 'F': [], 'V': [], 'L': [], 'W': [], 'M': [], 'A': [], 'G': [], 'C': [], 'Y': [], 'P': [],
+ 'T': [], 'S': [], 'H': [], 'E': [], 'N': [], 'Q': [], 'D': [], 'K': [], 'R': []}
+ for scale in scales.keys():
+ for k, v in scales[scale].items():
+ d[k].extend(v)
+ return 'all', d
+
+ elif scalename == 'instability':
+ d = {
+ "A": {"A": 1.0, "C": 44.94, "E": 1.0, "D": -7.49, "G": 1.0, "F": 1.0, "I": 1.0, "H": -7.49, "K": 1.0,
+ "M": 1.0, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, "T": 1.0, "W": 1.0, "V": 1.0,
+ "Y": 1.0},
+ "C": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 20.26, "G": 1.0, "F": 1.0, "I": 1.0, "H": 33.6, "K": 1.0,
+ "M": 33.6, "L": 20.26, "N": 1.0, "Q": -6.54, "P": 20.26, "S": 1.0, "R": 1.0, "T": 33.6, "W": 24.68,
+ "V": -6.54, "Y": 1.0},
+ "E": {"A": 1.0, "C": 44.94, "E": 33.6, "D": 20.26, "G": 1.0, "F": 1.0, "I": 20.26, "H": -6.54, "K": 1.0,
+ "M": 1.0, "L": 1.0, "N": 1.0, "Q": 20.26, "P": 20.26, "S": 20.26, "R": 1.0, "T": 1.0, "W": -14.03,
+ "V": 1.0, "Y": 1.0},
+ "D": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0, "K": -7.49,
+ "M": 1.0, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 1.0, "S": 20.26, "R": -6.54, "T": -14.03, "W": 1.0,
+ "V": 1.0, "Y": 1.0},
+ "G": {"A": -7.49, "C": 1.0, "E": -6.54, "D": 1.0, "G": 13.34, "F": 1.0, "I": -7.49, "H": 1.0, "K": -7.49,
+ "M": 1.0, "L": 1.0, "N": -7.49, "Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0, "T": -7.49, "W": 13.34,
+ "V": 1.0, "Y": -7.49},
+ "F": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 13.34, "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, "K": -14.03,
+ "M": 1.0, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, "T": 1.0, "W": 1.0, "V": 1.0,
+ "Y": 33.601},
+ "I": {"A": 1.0, "C": 1.0, "E": 44.94, "D": 1.0, "G": 1.0, "F": 1.0, "I": 1.0, "H": 13.34, "K": -7.49,
+ "M": 1.0, "L": 20.26, "N": 1.0, "Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0, "T": 1.0, "W": 1.0,
+ "V": -7.49, "Y": 1.0},
+ "H": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": -9.37, "F": -9.37, "I": 44.94, "H": 1.0, "K": 24.68,
+ "M": 1.0, "L": 1.0, "N": 24.68, "Q": 1.0, "P": -1.88, "S": 1.0, "R": 1.0, "T": -6.54, "W": -1.88,
+ "V": 1.0, "Y": 44.94},
+ "K": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": -7.49, "F": 1.0, "I": -7.49, "H": 1.0, "K": 1.0,
+ "M": 33.6, "L": -7.49, "N": 1.0, "Q": 24.64, "P": -6.54, "S": 1.0, "R": 33.6, "T": 1.0, "W": 1.0,
+ "V": -7.49, "Y": 1.0},
+ "M": {"A": 13.34, "C": 1.0, "E": 1.0, "D": 1.0, "G": 1.0, "F": 1.0, "I": 1.0, "H": 58.28, "K": 1.0,
+ "M": -1.88, "L": 1.0, "N": 1.0, "Q": -6.54, "P": 44.94, "S": 44.94, "R": -6.54, "T": -1.88, "W": 1.0,
+ "V": 1.0, "Y": 24.68},
+ "L": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, "K": -7.49, "M": 1.0,
+ "L": 1.0, "N": 1.0, "Q": 33.6, "P": 20.26, "S": 1.0, "R": 20.26, "T": 1.0, "W": 24.68, "V": 1.0,
+ "Y": 1.0},
+ "N": {"A": 1.0, "C": -1.88, "E": 1.0, "D": 1.0, "G": -14.03, "F": -14.03, "I": 44.94, "H": 1.0, "K": 24.68,
+ "M": 1.0, "L": 1.0, "N": 1.0, "Q": -6.54, "P": -1.88, "S": 1.0, "R": 1.0, "T": -7.49, "W": -9.37,
+ "V": 1.0, "Y": 1.0},
+ "Q": {"A": 1.0, "C": -6.54, "E": 20.26, "D": 20.26, "G": 1.0, "F": -6.54, "I": 1.0, "H": 1.0, "K": 1.0,
+ "M": 1.0, "L": 1.0, "N": 1.0, "Q": 20.26, "P": 20.26, "S": 44.94, "R": 1.0, "T": 1.0, "W": 1.0,
+ "V": -6.54, "Y": -6.54},
+ "P": {"A": 20.26, "C": -6.54, "E": 18.38, "D": -6.54, "G": 1.0, "F": 20.26, "I": 1.0, "H": 1.0, "K": 1.0,
+ "M": -6.54, "L": 1.0, "N": 1.0, "Q": 20.26, "P": 20.26, "S": 20.26, "R": -6.54, "T": 1.0, "W": -1.88,
+ "V": 20.26, "Y": 1.0},
+ "S": {"A": 1.0, "C": 33.6, "E": 20.26, "D": 1.0, "G": 1.0, "F": 1.0, "I": 1.0, "H": 1.0, "K": 1.0, "M": 1.0,
+ "L": 1.0, "N": 1.0, "Q": 20.26, "P": 44.94, "S": 20.26, "R": 20.26, "T": 1.0, "W": 1.0, "V": 1.0,
+ "Y": 1.0},
+ "R": {"A": 1.0, "C": 1.0, "E": 1.0, "D": 1.0, "G": -7.49, "F": 1.0, "I": 1.0, "H": 20.26, "K": 1.0,
+ "M": 1.0, "L": 1.0, "N": 13.34, "Q": 20.26, "P": 20.26, "S": 44.94, "R": 58.28, "T": 1.0, "W": 58.28,
+ "V": 1.0, "Y": -6.54},
+ "T": {"A": 1.0, "C": 1.0, "E": 20.26, "D": 1.0, "G": -7.49, "F": 13.34, "I": 1.0, "H": 1.0, "K": 1.0,
+ "M": 1.0, "L": 1.0, "N": -14.03, "Q": -6.54, "P": 1.0, "S": 1.0, "R": 1.0, "T": 1.0, "W": -14.03,
+ "V": 1.0, "Y": 1.0},
+ "W": {"A": -14.03, "C": 1.0, "E": 1.0, "D": 1.0, "G": -9.37, "F": 1.0, "I": 1.0, "H": 24.68, "K": 1.0,
+ "M": 24.68, "L": 13.34, "N": 13.34, "Q": 1.0, "P": 1.0, "S": 1.0, "R": 1.0, "T": -14.03, "W": 1.0,
+ "V": -7.49, "Y": 1.0},
+ "V": {"A": 1.0, "C": 1.0, "E": 1.0, "D": -14.03, "G": -7.49, "F": 1.0, "I": 1.0, "H": 1.0, "K": -1.88,
+ "M": 1.0, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 20.26, "S": 1.0, "R": 1.0, "T": -7.49, "W": 1.0,
+ "V": 1.0, "Y": -6.54},
+ "Y": {"A": 24.68, "C": 1.0, "E": -6.54, "D": 24.68, "G": -7.49, "F": 1.0, "I": 1.0, "H": 13.34, "K": 1.0,
+ "M": 44.94, "L": 1.0, "N": 1.0, "Q": 1.0, "P": 13.34, "S": 1.0, "R": -15.91, "T": -7.49, "W": -9.37,
+ "V": 1.0, "Y": 13.34}}
+ return 'instability', d
+
+ else:
+ return scalename, scales[scalename]
+
+
+def read_fasta(inputfile):
+ """Method for loading sequences from a FASTA formatted file into :py:attr:`sequences` & :py:attr:`names`.
+ This method is used by the base class :class:`modlamp.descriptors.PeptideDescriptor` if the input is a FASTA file.
+
+ :param inputfile: .fasta file with sequences and headers to read
+ :return: list of sequences in the attribute :py:attr:`sequences` with corresponding sequence names in
+ :py:attr:`names`.
+ """
+ names = list() # list for storing names
+ sequences = list() # list for storing sequences
+ seq = str()
+ with open(inputfile) as f:
+ all = f.readlines()
+ last = all[-1]
+ for line in all:
+ if line.startswith('>'):
+ names.append(line.split(' ')[0][1:].strip()) # add FASTA name without description as molecule name
+ sequences.append(seq.strip())
+ seq = str()
+ elif line == last:
+ seq += line.strip() # remove potential white space
+ sequences.append(seq.strip())
+ else:
+ seq += line.strip() # remove potential white space
+ return sequences[1:], names
+
+
+def save_fasta(filename, sequences, names=None):
+ """Method for saving sequences in the instance :py:attr:`sequences` to a file in FASTA format.
+
+ :param filename: {str} output filename (ending .fasta)
+ :param sequences: {list} sequences to be saved to file
+ :param names: {list} whether sequence names from self.names should be saved as sequence identifiers
+ :return: a FASTA formatted file containing the generated sequences
+ """
+ if os.path.exists(filename):
+ os.remove(filename) # remove outputfile, it it exists
+
+ with open(filename, 'w') as o:
+ for n, seq in enumerate(sequences):
+ if names:
+ o.write('>' + str(names[n]) + '\n')
+ else:
+ o.write('>Seq_' + str(n) + '\n')
+ o.write(seq + '\n')
+
+
+def aa_weights():
+ """Function holding molecular weight data on all natural amino acids.
+
+ :return: dictionary with amino acid letters and corresponding weights
+
+ .. versionadded:: v2.4.1
+ """
+ weights = {'A': 89.093, 'C': 121.158, 'D': 133.103, 'E': 147.129, 'F': 165.189, 'G': 75.067,
+ 'H': 155.155, 'I': 131.173, 'K': 146.188, 'L': 131.173, 'M': 149.211, 'N': 132.118,
+ 'P': 115.131, 'Q': 146.145, 'R': 174.20, 'S': 105.093, 'T': 119.119, 'V': 117.146,
+ 'W': 204.225, 'Y': 181.189}
+ return weights
+
+
+def count_aas(seq, scale='relative'):
+ """Function to count the amino acids occuring in a given sequence.
+
+ :param seq: {str} amino acid sequence
+ :param scale: {'absolute' or 'relative'} defines whether counts or frequencies are given for each AA
+ :return: {dict} dictionary with amino acids as keys and their counts in the sequence as values.
+ """
+ if seq == '': # error if len(seq) == 0
+ seq = ' '
+ aas = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
+ scl = 1.
+ if scale == 'relative':
+ scl = len(seq)
+ aa = {a: (float(seq.count(a)) / scl) for a in aas}
+ aa = collections.OrderedDict(sorted(list(aa.items())))
+ return aa
+
+
+def count_ngrams(seq, n):
+ """Function to count the n-grams of an amino acid sequence. N can be one integer or a list of integers
+
+ :param seq: {str} amino acid sequence
+ :param n: {int or list of ints} defines whether counts or frequencies are given for each AA
+ :return: {dict} dictionary with n-grams as keys and their counts in the sequence as values.
+ """
+ if seq == '':
+ seq = ' '
+ if isinstance(n, int):
+ n = [n]
+ ngrams = list()
+ for i in n:
+ ngrams.extend([seq[j:j+i] for j in range(len(seq) - (i-1))])
+ counts = {g: (seq.count(g)) for g in set(ngrams)}
+ counts = collections.OrderedDict(sorted(counts.items(), key=operator.itemgetter(1), reverse=True))
+ return counts
+
+
+def aa_energies():
+ """Function holding free energies of transfer between cyclohexane and water for all natural amino acids.
+ H. G. Boman, D. Wade, I. a Boman, B. Wåhlin, R. B. Merrifield, *FEBS Lett*. **1989**, *259*, 103–106.
+
+ :return: dictionary with amino acid letters and corresponding energies.
+ """
+ energies = {'L': -4.92, 'I': -4.92, 'V': -4.04, 'F': -2.98, 'M': -2.35, 'W': -2.33, 'A': -1.81, 'C': -1.28,
+ 'G': -0.94, 'Y': 0.14, 'T': 2.57, 'S': 3.40, 'H': 4.66, 'Q': 5.54, 'K': 5.55, 'N': 6.64, 'E': 6.81,
+ 'D': 8.72, 'R': 14.92, 'P': 0.}
+ return energies
+
+
+def ngrams_apd():
+ """Function returning the most frequent 2-, 3- and 4-grams from all sequences in the `APD3
+ `_, version August 2016 with 2727 sequences.
+ For all 2, 3 and 4grams, all possible ngrams were generated from all sequences and the top 50 most frequent
+ assembled into a list. Finally, leading and tailing spaces were striped and duplicates as well as ngrams containing
+ spaces were removed.
+
+ :return: numpy.array containing most frequent ngrams
+ """
+ return np.array(['AGK', 'CKI', 'RR', 'YGGG', 'LSGL', 'RG', 'YGGY', 'PRP', 'LGGG',
+ 'GV', 'GT', 'GS', 'GR', 'IAG', 'GG', 'GF', 'GC', 'GGYG', 'GA', 'GL',
+ 'GK', 'GI', 'IPC', 'KAA', 'LAK', 'GLGG', 'GGLG', 'CKIT', 'GAGK',
+ 'LLSG', 'LKK', 'FLP', 'LSG', 'SCK', 'LLS', 'GETC', 'VLG', 'GKLL',
+ 'LLG', 'C', 'KCKI', 'G', 'VGK', 'CSC', 'TKKC', 'GCS', 'GKA', 'IGK',
+ 'GESC', 'KVCY', 'KKL', 'KKI', 'KKC', 'LGGL', 'GLL', 'CGE', 'GGYC',
+ 'GLLS', 'GLF', 'AKK', 'GKAA', 'ESCV', 'GLP', 'CGES', 'PCGE', 'FL',
+ 'CGET', 'GLW', 'KGAA', 'KAAL', 'GGY', 'GGG', 'IKG', 'LKG', 'GGL',
+ 'CK', 'GTC', 'CG', 'SKKC', 'CS', 'CR', 'KC', 'AGKA', 'KA', 'KG',
+ 'LKCK', 'SCKL', 'KK', 'KI', 'KN', 'KL', 'SK', 'KV', 'SL', 'SC',
+ 'SG', 'AAA', 'VAK', 'AAL', 'AAK', 'GGGG', 'KNVA', 'GGGL', 'GYG',
+ 'LG', 'LA', 'LL', 'LK', 'LS', 'LP', 'GCSC', 'TC', 'GAA', 'AA', 'VA',
+ 'VC', 'AG', 'VG', 'AI', 'AK', 'VL', 'AL', 'TPGC', 'IK', 'IA', 'IG',
+ 'YGG', 'LGK', 'CSCK', 'GYGG', 'LGG', 'KGA'])
+
+
+def aa_formulas():
+ """
+ Function returning the molecular formulas of all amino acids. All amino acids are considered in the neutral form
+ (uncharged).
+ """
+ formulas = {'A': {'C': 3, 'H': 7, 'N': 1, 'O': 2, 'S': 0},
+ 'C': {'C': 3, 'H': 7, 'N': 1, 'O': 2, 'S': 1},
+ 'D': {'C': 4, 'H': 7, 'N': 1, 'O': 4, 'S': 0},
+ 'E': {'C': 5, 'H': 9, 'N': 1, 'O': 4, 'S': 0},
+ 'F': {'C': 9, 'H': 11, 'N': 1, 'O': 2, 'S': 0},
+ 'G': {'C': 2, 'H': 5, 'N': 1, 'O': 2, 'S': 0},
+ 'H': {'C': 6, 'H': 9, 'N': 3, 'O': 2, 'S': 0},
+ 'I': {'C': 6, 'H': 13, 'N': 1, 'O': 2, 'S': 0},
+ 'K': {'C': 6, 'H': 14, 'N': 2, 'O': 2, 'S': 0},
+ 'L': {'C': 6, 'H': 13, 'N': 1, 'O': 2, 'S': 0},
+ 'M': {'C': 5, 'H': 11, 'N': 1, 'O': 2, 'S': 1},
+ 'N': {'C': 4, 'H': 8, 'N': 2, 'O': 3, 'S': 0},
+ 'P': {'C': 5, 'H': 9, 'N': 1, 'O': 2, 'S': 0},
+ 'Q': {'C': 5, 'H': 10, 'N': 2, 'O': 3, 'S': 0},
+ 'R': {'C': 6, 'H': 14, 'N': 4, 'O': 2, 'S': 0},
+ 'S': {'C': 3, 'H': 7, 'N': 1, 'O': 3, 'S': 0},
+ 'T': {'C': 4, 'H': 9, 'N': 1, 'O': 3, 'S': 0},
+ 'V': {'C': 5, 'H': 11, 'N': 1, 'O': 2, 'S': 0},
+ 'W': {'C': 11, 'H': 12, 'N': 2, 'O': 2, 'S': 0},
+ 'Y': {'C': 9, 'H': 11, 'N': 1, 'O': 3, 'S': 0}
+ }
+ return formulas
diff -r 000000000000 -r 9caa9aa44fd8 cpt_helical_wheel/plotWheels/descriptors.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_helical_wheel/plotWheels/descriptors.py Tue Jul 05 05:21:34 2022 +0000
@@ -0,0 +1,1097 @@
+# -*- coding: utf-8 -*-
+"""
+.. currentmodule:: modlamp.descriptors
+
+.. moduleauthor:: modlab Alex Mueller ETH Zurich
+
+This module incorporates different classes to calculate peptide descriptor values. The following classes are available:
+
+============================= ============================================================================
+Class Characteristics
+============================= ============================================================================
+:py:class:`GlobalDescriptor` Global one-dimensional peptide descriptors calculated from the AA sequence.
+:py:class:`PeptideDescriptor` AA scale based global or convoluted descriptors (auto-/cross-correlated).
+============================= ============================================================================
+
+.. seealso:: :class:`modlamp.core.BaseDescriptor` from which the classes in :mod:`modlamp.descriptors` inherit.
+"""
+
+import sys
+
+import numpy as np
+from scipy import stats
+from sklearn.externals.joblib import Parallel, delayed
+
+from plotWheels.core import BaseDescriptor, load_scale, count_aas, aa_weights, aa_energies, aa_formulas
+
+__author__ = "Alex Müller, Gisela Gabernet"
+__docformat__ = "restructuredtext en"
+
+
+def _one_autocorr(seq, window, scale):
+ """Private function used for calculating auto-correlated descriptors for 1 given sequence, window and an AA scale.
+ This function is used by the :py:func:`calculate_autocorr` method of :py:class:`PeptideDescriptor`.
+
+ :param seq: {str} amino acid sequence to calculate descriptor for
+ :param window: {int} correlation-window size
+ :param scale: {str} amino acid scale to be used to calculate descriptor
+ :return: {numpy.array} calculated descriptor data
+ """
+ try:
+ m = list() # list of lists to store translated sequence values
+ for l in range(len(seq)): # translate AA sequence into values
+ m.append(scale[str(seq[l])])
+ # auto-correlation in defined sequence window
+ seqdesc = list()
+ for dist in range(window): # for all correlation distances
+ for val in range(len(scale['A'])): # for all features of the descriptor scale
+ valsum = list()
+ cntr = 0.
+ for pos in range(len(seq)): # for every position in the sequence
+ if (pos + dist) < len(seq): # check if corr distance is possible at that sequence position
+ cntr += 1 # counter to scale sum
+ valsum.append(m[pos][val] * m[pos + dist][val])
+ seqdesc.append(sum(valsum) / cntr) # append scaled correlation distance values
+ return seqdesc
+ except ZeroDivisionError:
+ print("ERROR!\nThe chosen correlation window % i is larger than the sequence %s !" % (window, seq))
+
+
+def _one_crosscorr(seq, window, scale):
+ """Private function used for calculating cross-correlated descriptors for 1 given sequence, window and an AA scale.
+ This function is used by the :py:func:`calculate_crosscorr` method of :py:class:`PeptideDescriptor`.
+
+ :param seq: {str} amino acid sequence to calculate descriptor for
+ :param window: {int} correlation-window size
+ :param scale: {str} amino acid scale to be used to calculate descriptor
+ :return: {numpy.array} calculated descriptor data
+ """
+ try:
+ m = list() # list of lists to store translated sequence values
+ for l in range(len(seq)): # translate AA sequence into values
+ m.append(scale[str(seq[l])])
+ # auto-correlation in defined sequence window
+ seqdesc = list()
+ for val in range(len(scale['A'])): # for all features of the descriptor scale
+ for cc in range(len(scale['A'])): # for every feature cross correlation
+ if (val + cc) < len(scale['A']): # check if corr distance is in range of the num of features
+ for dist in range(window): # for all correlation distances
+ cntr = float()
+ valsum = list()
+ for pos in range(len(seq)): # for every position in the sequence
+ if (pos + dist) < len(seq): # check if corr distance is possible at that sequence pos
+ cntr += 1 # counter to scale sum
+ valsum.append(m[pos][val] * m[pos + dist][val + cc])
+ seqdesc.append(sum(valsum) / cntr) # append scaled correlation distance values
+ return seqdesc
+ except ZeroDivisionError:
+ print("ERROR!\nThe chosen correlation window % i is larger than the sequence %s !" % (window, seq))
+
+
+def _one_arc(seq, modality, scale):
+ """ Privat function used for calculating arc descriptors for one sequence and AA scale. This function is used by
+ :py:func:`calculate_arc` method method of :py:class:`PeptideDescriptor`.
+
+ :param seq: {str} amino acid sequence to calculate descriptor for
+ :param scale: {str} amino acid scale to be used to calculate descriptor
+ :return: {numpy.array} calculated descriptor data
+ """
+ desc_mat = []
+ for aa in seq:
+ desc_mat.append(scale[aa])
+ desc_mat = np.asarray(desc_mat)
+
+ # Check descriptor dimension
+ desc_dim = desc_mat.shape[1]
+
+ # list to store descriptor values for all windows
+ allwindows_arc = []
+
+ if len(seq) > 18:
+ window = 18
+ # calculates number of windows in sequence
+ num_windows = len(seq) - window
+ else:
+ window = len(seq)
+ num_windows = 1
+
+ # loop through all windows
+ for j in range(num_windows):
+ # slices descriptor matrix into current window
+ window_mat = desc_mat[j:j + window, :]
+
+ # defines order of amino acids in helical projection
+ order = [0, 11, 4, 15, 8, 1, 12, 5, 16, 9, 2, 13, 6, 17, 10, 3, 14, 7]
+
+ # orders window descriptor matrix into helical projection order
+ ordered = []
+ for pos in order:
+ try:
+ ordered.append(window_mat[pos, :])
+ except:
+ # for sequences of len < 18 adding dummy vector with 2s, length of descriptor dimensions
+ ordered.append([2] * desc_dim)
+ ordered = np.asarray(ordered)
+
+ window_arc = []
+
+ # loop through pharmacophoric features
+ for m in range(desc_dim):
+ all_arcs = [] # stores all arcs that can be found of a pharmacophoric feature
+ arc = 0
+
+ for n in range(18): # for all positions in helix, regardless of sequence length
+ if ordered[n, m] == 0: # if position does not contain pharmacophoric feature
+ all_arcs.append(arc) # append previous arc to all arcs list
+ arc = 0 # arc is initialized
+ elif ordered[n, m] == 1: # if position contains pharmacophoric feature(PF), elongate arc by 20°
+ arc += 20
+ elif ordered[n, m] == 2: # if position doesn't contain amino acid:
+ if ordered[n - 1, m] == 1: # if previous position contained PF add 10°
+ arc += 10
+ elif ordered[n - 1, m] == 0: # if previous position didn't contain PF don't add anything
+ arc += 0
+ elif ordered[
+ n - 2, m] == 1: # if previous position is empty then check second previous for PF
+ arc += 10
+ if n == 17: # if we are at the last position check for position n=0 instead of next position.
+ if ordered[0, m] == 1: # if it contains PF add 10° extra
+ arc += 10
+ else: # if next position contains PF add 10° extra
+ if ordered[n + 1, m] == 1:
+ arc += 10
+ elif ordered[n + 1, m] == 0:
+ arc += 0
+ else: # if next position is empty check for 2nd next position
+ if n == 16:
+ if ordered[0, m] == 1:
+ arc += 10
+ else:
+ if ordered[n + 2, m] == 1:
+ arc += 10
+
+ all_arcs.append(arc)
+ if not arc == 360:
+ arc0 = all_arcs.pop() + all_arcs[0] # join first and last arc together
+ all_arcs = [arc0] + all_arcs[1:]
+
+ window_arc.append(np.max(all_arcs)) # append to window arcs the maximum arc of this PF
+ allwindows_arc.append(window_arc) # append all PF arcs of this window
+
+ allwindows_arc = np.asarray(allwindows_arc)
+
+ if modality == 'max':
+ final_arc = np.max(allwindows_arc, axis=0) # calculate maximum / mean arc along all windows
+ elif modality == 'mean':
+ final_arc = np.mean(allwindows_arc, axis=0)
+ else:
+ print('modality is unknown, please choose between "max" and "mean"\n.')
+ sys.exit()
+ return final_arc
+
+
+def _charge(seq, ph=7.0, amide=False):
+ """Calculates charge of a single sequence. The method used is first described by Bjellqvist. In the case of
+ amidation, the value for the 'Cterm' pKa is 15 (and Cterm is added to the pos_pks dictionary.
+ The pKa scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed).
+
+ **pos_pks** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}
+
+ **neg_pks** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}
+
+ :param ph: {float} pH at which to calculate peptide charge.
+ :param amide: {boolean} whether the sequences have an amidated C-terminus.
+ :return: {array} descriptor values in the attribute :py:attr:`descriptor
+ """
+
+ if amide:
+ pos_pks = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}
+ neg_pks = {'Cterm': 15., 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}
+ else:
+ pos_pks = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}
+ neg_pks = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}
+
+ aa_content = count_aas(seq, scale='absolute')
+ aa_content['Nterm'] = 1.0
+ aa_content['Cterm'] = 1.0
+ pos_charge = 0.0
+ for aa, pK in pos_pks.items():
+ c_r = 10 ** (pK - ph)
+ partial_charge = c_r / (c_r + 1.0)
+ pos_charge += aa_content[aa] * partial_charge
+ neg_charge = 0.0
+ for aa, pK in neg_pks.items():
+ c_r = 10 ** (ph - pK)
+ partial_charge = c_r / (c_r + 1.0)
+ neg_charge += aa_content[aa] * partial_charge
+ return round(pos_charge - neg_charge, 3)
+
+
+class GlobalDescriptor(BaseDescriptor):
+ """
+ Base class for global, non-amino acid scale dependant descriptors. The following descriptors can be calculated by
+ the **methods** linked below:
+
+ - `Sequence Length `_
+ - `Molecular Formula `_
+ - `Molecular Weight `_
+ - `Sequence Charge `_
+ - `Charge Density `_
+ - `Isoelectric Point `_
+ - `Instability Index `_
+ - `Aromaticity `_
+ - `Aliphatic Index `_
+ - `Boman Index `_
+ - `Hydrophobic Ratio `_
+ - `all of the above `_
+ """
+
+ def length(self, append=False):
+ """
+ Method to calculate the length (total AA count) of every sequence in the attribute :py:attr:`sequences`.
+
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: array of sequence lengths in the attribute :py:attr:`descriptor`
+ :Example:
+
+ >>> desc = GlobalDescriptor(['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR'])
+ >>> desc.length()
+ >>> desc.descriptor
+ array([[ 8.], [12.], [12.]])
+ """
+ desc = []
+ for seq in self.sequences:
+ desc.append(float(len(seq.strip())))
+ desc = np.asarray(desc).reshape(len(desc), 1)
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ self.featurenames.append('Length')
+ else:
+ self.descriptor = np.array(desc)
+ self.featurenames = ['Length']
+
+ def formula(self, amide=False, append=False):
+ """Method to calculate the molecular formula of every sequence in the attribute :py:attr:`sequences`.
+
+ :param amide: {boolean} whether the sequences are C-terminally amidated.
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: array of molecular formulas {str} in the attribute :py:attr:`descriptor`
+ :Example:
+
+ >>> desc = GlobalDescriptor(['KADSFLSADGHSADFSLDKKLKERL', 'ERTILSDFPQWWFASLDFLNC', 'ACDEFGHIKLMNPQRSTVWY'])
+ >>> desc.formula(amide=True)
+ >>> for v in desc.descriptor:
+ ... print(v[0])
+ C122 H197 N35 O39
+ C121 H168 N28 O33 S
+ C106 H157 N29 O30 S2
+
+ .. seealso:: :py:func:`modlamp.core.aa_formulas()`
+
+ .. versionadded:: v2.7.6
+ """
+ desc = []
+ formulas = aa_formulas()
+ for seq in self.sequences:
+ f = {'C': 0, 'H': 0, 'N': 0, 'O': 0, 'S': 0}
+ for aa in seq: # loop over all AAs
+ for k in f.keys():
+ f[k] += formulas[aa][k]
+
+ # substract H2O for every peptide bond
+ f['H'] -= 2 * (len(seq) - 1)
+ f['O'] -= (len(seq) - 1)
+
+ if amide: # add C-terminal amide --> replace OH with NH2
+ f['O'] -= 1
+ f['H'] += 1
+ f['N'] += 1
+
+ if f['S'] != 0:
+ val = 'C%s H%s N%s O%s %s%s' % (f['C'], f['H'], f['N'], f['O'], 'S', f['S'])
+ else:
+ val = 'C%s H%s N%s O%s' % (f['C'], f['H'], f['N'], f['O'])
+
+ desc.append([val])
+
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ self.featurenames.append('Formula')
+ else:
+ self.descriptor = np.array(desc)
+ self.featurenames = ['Formula']
+
+ def calculate_MW(self, amide=False, append=False):
+ """Method to calculate the molecular weight [g/mol] of every sequence in the attribute :py:attr:`sequences`.
+
+ :param amide: {boolean} whether the sequences are C-terminally amidated (subtracts 0.95 from the MW).
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: array of descriptor values in the attribute :py:attr:`descriptor`
+ :Example:
+
+ >>> desc = GlobalDescriptor('IAESFKGHIPL')
+ >>> desc.calculate_MW(amide=True)
+ >>> desc.descriptor
+ array([[ 1210.43]])
+
+ .. seealso:: :py:func:`modlamp.core.aa_weights()`
+
+ .. versionchanged:: v2.1.5 amide option added
+ """
+ desc = []
+ weights = aa_weights()
+ for seq in self.sequences:
+ mw = []
+ for aa in seq: # sum over aa weights
+ mw.append(weights[aa])
+ desc.append(round(sum(mw) - 18.015 * (len(seq) - 1), 2)) # sum over AA MW and subtract H20 MW for every
+ # peptide bond
+ desc = np.asarray(desc).reshape(len(desc), 1)
+ if amide: # if sequences are amidated, subtract 0.98 from calculated MW (OH - NH2)
+ desc = [d - 0.98 for d in desc]
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ self.featurenames.append('MW')
+ else:
+ self.descriptor = np.array(desc)
+ self.featurenames = ['MW']
+
+ def calculate_charge(self, ph=7.0, amide=False, append=False):
+ """Method to overall charge of every sequence in the attribute :py:attr:`sequences`.
+
+ The method used is first described by Bjellqvist. In the case of amidation, the value for the 'Cterm' pKa is 15
+ (and Cterm is added to the pos_pKs dictionary.
+ The pKa scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed).
+
+ **pos_pKs** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}
+
+ **neg_pKs** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}
+
+ :param ph: {float} ph at which to calculate peptide charge.
+ :param amide: {boolean} whether the sequences have an amidated C-terminus.
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: array of descriptor values in the attribute :py:attr:`descriptor`
+ :Example:
+
+ >>> desc = GlobalDescriptor('KLAKFGKRSELVALSG')
+ >>> desc.calculate_charge(ph=7.4, amide=True)
+ >>> desc.descriptor
+ array([[ 3.989]])
+ """
+
+ desc = []
+ for seq in self.sequences:
+ desc.append(_charge(seq, ph, amide)) # calculate charge with helper function
+ desc = np.asarray(desc).reshape(len(desc), 1)
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ self.featurenames.append('Charge')
+ else:
+ self.descriptor = np.array(desc)
+ self.featurenames = ['Charge']
+
+ def charge_density(self, ph=7.0, amide=False, append=False):
+ """Method to calculate the charge density (charge / MW) of every sequences in the attributes :py:attr:`sequences`
+
+ :param ph: {float} pH at which to calculate peptide charge.
+ :param amide: {boolean} whether the sequences have an amidated C-terminus.
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: array of descriptor values in the attribute :py:attr:`descriptor`.
+ :Example:
+
+ >>> desc = GlobalDescriptor('GNSDLLIEQRTLLASDEF')
+ >>> desc.charge_density(ph=6, amide=True)
+ >>> desc.descriptor
+ array([[-0.00097119]])
+ """
+ self.calculate_charge(ph, amide)
+ charges = self.descriptor
+ self.calculate_MW(amide)
+ masses = self.descriptor
+ desc = charges / masses
+ desc = np.asarray(desc).reshape(len(desc), 1)
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ self.featurenames.append('ChargeDensity')
+ else:
+ self.descriptor = np.array(desc)
+ self.featurenames = ['ChargeDensity']
+
+ def isoelectric_point(self, amide=False, append=False):
+ """
+ Method to calculate the isoelectric point of every sequence in the attribute :py:attr:`sequences`.
+ The pK scale is extracted from: http://www.hbcpnetbase.com/ (CRC Handbook of Chemistry and Physics, 96th ed).
+
+ **pos_pKs** = {'Nterm': 9.38, 'K': 10.67, 'R': 12.10, 'H': 6.04}
+
+ **neg_pKs** = {'Cterm': 2.15, 'D': 3.71, 'E': 4.15, 'C': 8.14, 'Y': 10.10}
+
+ :param amide: {boolean} whether the sequences have an amidated C-terminus.
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: array of descriptor values in the attribute :py:attr:`descriptor`
+ :Example:
+
+ >>> desc = GlobalDescriptor('KLFDIKFGHIPQRST')
+ >>> desc.isoelectric_point()
+ >>> desc.descriptor
+ array([[ 10.6796875]])
+ """
+ ph, ph1, ph2 = float(), float(), float()
+ desc = []
+ for seq in self.sequences:
+
+ # Bracket between ph1 and ph2
+ ph = 7.0
+ charge = _charge(seq, ph, amide)
+ if charge > 0.0:
+ ph1 = ph
+ charge1 = charge
+ while charge1 > 0.0:
+ ph = ph1 + 1.0
+ charge = _charge(seq, ph, amide)
+ if charge > 0.0:
+ ph1 = ph
+ charge1 = charge
+ else:
+ ph2 = ph
+ break
+ else:
+ ph2 = ph
+ charge2 = charge
+ while charge2 < 0.0:
+ ph = ph2 - 1.0
+ charge = _charge(seq, ph, amide)
+ if charge < 0.0:
+ ph2 = ph
+ charge2 = charge
+ else:
+ ph1 = ph
+ break
+ # Bisection
+ while ph2 - ph1 > 0.0001 and charge != 0.0:
+ ph = (ph1 + ph2) / 2.0
+ charge = _charge(seq, ph, amide)
+ if charge > 0.0:
+ ph1 = ph
+ else:
+ ph2 = ph
+ desc.append(ph)
+ desc = np.asarray(desc).reshape(len(desc), 1)
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ self.featurenames.append('pI')
+ else:
+ self.descriptor = np.array(desc)
+ self.featurenames = ['pI']
+
+ def instability_index(self, append=False):
+ """
+ Method to calculate the instability of every sequence in the attribute :py:attr:`sequences`.
+ The instability index is a prediction of protein stability based on the amino acid composition.
+ ([1] K. Guruprasad, B. V Reddy, M. W. Pandit, Protein Eng. 1990, 4, 155–161.)
+
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: array of descriptor values in the attribute :py:attr:`descriptor`
+ :Example:
+
+ >>> desc = GlobalDescriptor('LLASMNDLLAKRST')
+ >>> desc.instability_index()
+ >>> desc.descriptor
+ array([[ 63.95714286]])
+ """
+
+ desc = []
+ dimv = load_scale('instability')[1]
+ for seq in self.sequences:
+ stabindex = float()
+ for i in range(len(seq) - 1):
+ stabindex += dimv[seq[i]][seq[i+1]]
+ desc.append((10.0 / len(seq)) * stabindex)
+ desc = np.asarray(desc).reshape(len(desc), 1)
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ self.featurenames.append('InstabilityInd')
+ else:
+ self.descriptor = np.array(desc)
+ self.featurenames = ['InstabilityInd']
+
+ def aromaticity(self, append=False):
+ """
+ Method to calculate the aromaticity of every sequence in the attribute :py:attr:`sequences`.
+ According to Lobry, 1994, it is simply the relative frequency of Phe+Trp+Tyr.
+
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: array of descriptor values in the attribute :py:attr:`descriptor`
+ :Example:
+
+ >>> desc = GlobalDescriptor('GLFYWRFFLQRRFLYWW')
+ >>> desc.aromaticity()
+ >>> desc.descriptor
+ array([[ 0.52941176]])
+ """
+ desc = []
+ for seq in self.sequences:
+ f = seq.count('F')
+ w = seq.count('W')
+ y = seq.count('Y')
+ desc.append(float(f + w + y) / len(seq))
+ desc = np.asarray(desc).reshape(len(desc), 1)
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ self.featurenames.append('Aromaticity')
+ else:
+ self.descriptor = np.array(desc)
+ self.featurenames = ['Aromaticity']
+
+ def aliphatic_index(self, append=False):
+ """
+ Method to calculate the aliphatic index of every sequence in the attribute :py:attr:`sequences`.
+ According to Ikai, 1980, the aliphatic index is a measure of thermal stability of proteins and is dependant
+ on the relative volume occupied by aliphatic amino acids (A,I,L & V).
+ ([1] A. Ikai, J. Biochem. 1980, 88, 1895–1898.)
+
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: array of descriptor values in the attribute :py:attr:`descriptor`
+ :Example:
+
+ >>> desc = GlobalDescriptor('KWLKYLKKLAKLVK')
+ >>> desc.aliphatic_index()
+ >>> desc.descriptor
+ array([[ 139.28571429]])
+ """
+ desc = []
+ aa_dict = aa_weights()
+ for seq in self.sequences:
+ d = {aa: seq.count(aa) for aa in aa_dict.keys()} # count aa
+ d = {k: (float(d[k]) / len(seq)) * 100 for k in d.keys()} # get mole percent of all AA
+ desc.append(d['A'] + 2.9 * d['V'] + 3.9 * (d['I'] + d['L'])) # formula for calculating the AI (Ikai, 1980)
+ desc = np.asarray(desc).reshape(len(desc), 1)
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ self.featurenames.append('AliphaticInd')
+ else:
+ self.descriptor = np.array(desc)
+ self.featurenames = ['AliphaticInd']
+
+ def boman_index(self, append=False):
+ """Method to calculate the boman index of every sequence in the attribute :py:attr:`sequences`.
+ According to Boman, 2003, the boman index is a measure for protein-protein interactions and is calculated by
+ summing over all amino acid free energy of transfer [kcal/mol] between water and cyclohexane,[2] followed by
+ dividing by sequence length.
+ ([1] H. G. Boman, D. Wade, I. a Boman, B. Wåhlin, R. B. Merrifield, *FEBS Lett*. **1989**, *259*, 103–106.
+ [2] A. Radzick, R. Wolfenden, *Biochemistry* **1988**, *27*, 1664–1670.)
+
+ .. seealso:: :py:func:`modlamp.core.aa_energies()`
+
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: array of descriptor values in the attribute :py:attr:`descriptor`
+ :Example:
+
+ >>> desc = GlobalDescriptor('GLFDIVKKVVGALGSL')
+ >>> desc.boman_index()
+ >>> desc.descriptor
+ array([[-1.011875]])
+ """
+ d = aa_energies()
+ desc = []
+ for seq in self.sequences:
+ val = []
+ for a in seq:
+ val.append(d[a])
+ desc.append(sum(val) / len(val))
+ desc = np.asarray(desc).reshape(len(desc), 1)
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ self.featurenames.append('BomanInd')
+ else:
+ self.descriptor = np.array(desc)
+ self.featurenames = ['BomanInd']
+
+ def hydrophobic_ratio(self, append=False):
+ """
+ Method to calculate the hydrophobic ratio of every sequence in the attribute :py:attr:`sequences`, which is the
+ relative frequency of the amino acids **A,C,F,I,L,M & V**.
+
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: array of descriptor values in the attribute :py:attr:`descriptor`
+ :Example:
+
+ >>> desc = GlobalDescriptor('VALLYWRTVLLAIII')
+ >>> desc.hydrophobic_ratio()
+ >>> desc.descriptor
+ array([[ 0.73333333]])
+ """
+ desc = []
+ aa_dict = aa_weights()
+ for seq in self.sequences:
+ pa = {aa: seq.count(aa) for aa in aa_dict.keys()} # count aa
+ # formula for calculating the AI (Ikai, 1980):
+ desc.append((pa['A'] + pa['C'] + pa['F'] + pa['I'] + pa['L'] + pa['M'] + pa['V']) / float(len(seq)))
+ desc = np.asarray(desc).reshape(len(desc), 1)
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ self.featurenames.append('HydrophRatio')
+ else:
+ self.descriptor = np.array(desc)
+ self.featurenames = ['HydrophRatio']
+
+ def calculate_all(self, ph=7.4, amide=True):
+ """Method combining all global descriptors and appending them into the feature matrix in the attribute
+ :py:attr:`descriptor`.
+
+ :param ph: {float} pH at which to calculate peptide charge
+ :param amide: {boolean} whether the sequences have an amidated C-terminus.
+ :return: array of descriptor values in the attribute :py:attr:`descriptor`
+ :Example:
+
+ >>> desc = GlobalDescriptor('AFGHFKLKKLFIFGHERT')
+ >>> desc.calculate_all(amide=True)
+ >>> desc.featurenames
+ ['Length', 'MW', 'ChargeDensity', 'pI', 'InstabilityInd', 'Aromaticity', 'AliphaticInd', 'BomanInd', 'HydRatio']
+ >>> desc.descriptor
+ array([[ 18., 2.17559000e+03, 1.87167619e-03, 1.16757812e+01, ... 1.10555556e+00, 4.44444444e-01]])
+ >>> desc.save_descriptor('/path/to/outputfile.csv') # save the descriptor data (with feature names header)
+ """
+
+ # This is a strange way of doing it. However, the append=True option excludes length and charge, no idea why!
+ fn = []
+ self.length() # sequence length
+ l = self.descriptor
+ fn.extend(self.featurenames)
+ self.calculate_MW(amide=amide) # molecular weight
+ mw = self.descriptor
+ fn.extend(self.featurenames)
+ self.calculate_charge(ph=ph, amide=amide) # net charge
+ c = self.descriptor
+ fn.extend(self.featurenames)
+ self.charge_density(ph=ph, amide=amide) # charge density
+ cd = self.descriptor
+ fn.extend(self.featurenames)
+ self.isoelectric_point(amide=amide) # pI
+ pi = self.descriptor
+ fn.extend(self.featurenames)
+ self.instability_index() # instability index
+ si = self.descriptor
+ fn.extend(self.featurenames)
+ self.aromaticity() # global aromaticity
+ ar = self.descriptor
+ fn.extend(self.featurenames)
+ self.aliphatic_index() # aliphatic index
+ ai = self.descriptor
+ fn.extend(self.featurenames)
+ self.boman_index() # Boman index
+ bi = self.descriptor
+ fn.extend(self.featurenames)
+ self.hydrophobic_ratio() # Hydrophobic ratio
+ hr = self.descriptor
+ fn.extend(self.featurenames)
+
+ self.descriptor = np.concatenate((l, mw, c, cd, pi, si, ar, ai, bi, hr), axis=1)
+ self.featurenames = fn
+
+
+class PeptideDescriptor(BaseDescriptor):
+ """Base class for peptide descriptors. The following **amino acid descriptor scales** are available for descriptor
+ calculation:
+
+ - **AASI** (An amino acid selectivity index scale for helical antimicrobial peptides, *[1] D. Juretić, D. Vukicević, N. Ilić, N. Antcheva, A. Tossi, J. Chem. Inf. Model. 2009, 49, 2873–2882.*)
+ - **ABHPRK** (modlabs inhouse physicochemical feature scale (Acidic, Basic, Hydrophobic, Polar, aRomatic, Kink-inducer)
+ - **argos** (Argos hydrophobicity amino acid scale, *[2] Argos, P., Rao, J. K. M. & Hargrave, P. A., Eur. J. Biochem. 2005, 128, 565–575.*)
+ - **bulkiness** (Amino acid side chain bulkiness scale, *[3] J. M. Zimmerman, N. Eliezer, R. Simha, J. Theor. Biol. 1968, 21, 170–201.*)
+ - **charge_phys** (Amino acid charge at pH 7.0 - Hystidine charge +0.1.)
+ - **charge_acid** (Amino acid charge at acidic pH - Hystidine charge +1.0.)
+ - **cougar** (modlabs inhouse selection of global peptide descriptors)
+ - **eisenberg** (the Eisenberg hydrophobicity consensus amino acid scale, *[4] D. Eisenberg, R. M. Weiss, T. C. Terwilliger, W. Wilcox, Faraday Symp. Chem. Soc. 1982, 17, 109.*)
+ - **Ez** (potential that assesses energies of insertion of amino acid side chains into lipid bilayers, *[5] A. Senes, D. C. Chadi, P. B. Law, R. F. S. Walters, V. Nanda, W. F. DeGrado, J. Mol. Biol. 2007, 366, 436–448.*)
+ - **flexibility** (amino acid side chain flexibilitiy scale, *[6] R. Bhaskaran, P. K. Ponnuswamy, Int. J. Pept. Protein Res. 1988, 32, 241–255.*)
+ - **grantham** (amino acid side chain composition, polarity and molecular volume, *[8] Grantham, R. Science. 185, 862–864 (1974).*)
+ - **gravy** (GRAVY hydrophobicity amino acid scale, *[9] J. Kyte, R. F. Doolittle, J. Mol. Biol. 1982, 157, 105–132.*)
+ - **hopp-woods** (Hopp-Woods amino acid hydrophobicity scale,*[10] T. P. Hopp, K. R. Woods, Proc. Natl. Acad. Sci. 1981, 78, 3824–3828.*)
+ - **ISAECI** (Isotropic Surface Area (ISA) and Electronic Charge Index (ECI) of amino acid side chains, *[11] E. R. Collantes, W. J. Dunn, J. Med. Chem. 1995, 38, 2705–2713.*)
+ - **janin** (Janin hydrophobicity amino acid scale, *[12] J. L. Cornette, K. B. Cease, H. Margalit, J. L. Spouge, J. A. Berzofsky, C. DeLisi, J. Mol. Biol. 1987, 195, 659–685.*)
+ - **kytedoolittle** (Kyte & Doolittle hydrophobicity amino acid scale, *[13] J. Kyte, R. F. Doolittle, J. Mol. Biol. 1982, 157, 105–132.*)
+ - **levitt_alpha** (Levitt amino acid alpha-helix propensity scale, extracted from http://web.expasy.org/protscale. *[14] M. Levitt, Biochemistry 1978, 17, 4277-4285.*)
+ - **MSS** (A graph-theoretical index that reflects topological shape and size of amino acid side chains, *[15] C. Raychaudhury, A. Banerjee, P. Bag, S. Roy, J. Chem. Inf. Comput. Sci. 1999, 39, 248–254.*)
+ - **MSW** (Amino acid scale based on a PCA of the molecular surface based WHIM descriptor (MS-WHIM), extended to natural amino acids, *[16] A. Zaliani, E. Gancia, J. Chem. Inf. Comput. Sci 1999, 39, 525–533.*)
+ - **pepArc** (modlabs pharmacophoric feature scale, dimensions are: hydrophobicity, polarity, positive charge, negative charge, proline.)
+ - **pepcats** (modlabs pharmacophoric feature based PEPCATS scale, *[17] C. P. Koch, A. M. Perna, M. Pillong, N. K. Todoroff, P. Wrede, G. Folkers, J. A. Hiss, G. Schneider, PLoS Comput. Biol. 2013, 9, e1003088.*)
+ - **polarity** (Amino acid polarity scale, *[18] J. M. Zimmerman, N. Eliezer, R. Simha, J. Theor. Biol. 1968, 21, 170–201.*)
+ - **PPCALI** (modlabs inhouse scale derived from a PCA of 143 amino acid property scales, *[19] C. P. Koch, A. M. Perna, M. Pillong, N. K. Todoroff, P. Wrede, G. Folkers, J. A. Hiss, G. Schneider, PLoS Comput. Biol. 2013, 9, e1003088.*)
+ - **refractivity** (Relative amino acid refractivity values, *[20] T. L. McMeekin, M. Wilensky, M. L. Groves, Biochem. Biophys. Res. Commun. 1962, 7, 151–156.*)
+ - **t_scale** (A PCA derived scale based on amino acid side chain properties calculated with 6 different probes of the GRID program, *[21] M. Cocchi, E. Johansson, Quant. Struct. Act. Relationships 1993, 12, 1–8.*)
+ - **TM_tend** (Amino acid transmembrane propensity scale, extracted from http://web.expasy.org/protscale, *[22] Zhao, G., London E. Protein Sci. 2006, 15, 1987-2001.*)
+ - **z3** (The original three dimensional Z-scale, *[23] S. Hellberg, M. Sjöström, B. Skagerberg, S. Wold, J. Med. Chem. 1987, 30, 1126–1135.*)
+ - **z5** (The extended five dimensional Z-scale, *[24] M. Sandberg, L. Eriksson, J. Jonsson, M. Sjöström, S. Wold, J. Med. Chem. 1998, 41, 2481–2491.*)
+
+ Further, amino acid scale independent methods can be calculated with help of the :class:`GlobalDescriptor` class.
+
+ """
+
+ def __init__(self, seqs, scalename='Eisenberg'):
+ """
+ :param seqs: a .fasta file with sequences, a list of sequences or a single sequence as string to calculate the
+ descriptor values for.
+ :param scalename: {str} name of the amino acid scale (one of the given list above) used to calculate the
+ descriptor values
+ :return: initialized attributes :py:attr:`sequences`, :py:attr:`names` and dictionary :py:attr:`scale` with
+ amino acid scale values of the scale name in :py:attr:`scalename`.
+ :Example:
+
+ >>> AMP = PeptideDescriptor('KLLKLLKKLLKLLK','pepcats')
+ >>> AMP.sequences
+ ['KLLKLLKKLLKLLK']
+ >>> seqs = PeptideDescriptor('/Path/to/file.fasta', 'eisenberg') # load sequences from .fasta file
+ >>> seqs.sequences
+ ['AFDGHLKI','KKLQRSDLLRTK','KKLASCNNIPPR'...]
+ """
+ super(PeptideDescriptor, self).__init__(seqs)
+ self.scalename, self.scale = load_scale(scalename.lower())
+ self.all_moms = list() # for passing hydrophobic moments to calculate_profile
+ self.all_globs = list() # for passing global to calculate_profile
+
+ def load_scale(self, scalename):
+ """Method to load amino acid values from a given scale
+
+ :param scalename: {str} name of the amino acid scale to be loaded.
+ :return: loaded amino acid scale values in a dictionary in the attribute :py:attr:`scale`.
+
+ .. seealso:: :func:`modlamp.core.load_scale()`
+ """
+ self.scalename, self.scale = load_scale(scalename.lower())
+
+ def calculate_autocorr(self, window, append=False):
+ """Method for auto-correlating the amino acid values for a given descriptor scale
+
+ :param window: {int} correlation window for descriptor calculation in a sliding window approach
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: calculated descriptor numpy.array in the attribute :py:attr:`descriptor`.
+ :Example:
+
+ >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','PPCALI')
+ >>> AMP.calculate_autocorr(7)
+ >>> AMP.descriptor
+ array([[ 1.28442339e+00, 1.29025116e+00, 1.03240901e+00, .... ]])
+ >>> AMP.descriptor.shape
+ (1, 133)
+
+ .. versionchanged:: v.2.3.0
+ """
+ desc = Parallel(n_jobs=-1)(delayed(_one_autocorr)(seq, window, self.scale) for seq in self.sequences)
+
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ else:
+ self.descriptor = np.array(desc)
+
+ def calculate_crosscorr(self, window, append=False):
+ """Method for cross-correlating the amino acid values for a given descriptor scale
+
+ :param window: {int} correlation window for descriptor calculation in a sliding window approach
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: calculated descriptor numpy.array in the attribute :py:attr:`descriptor`.
+ :Example:
+
+ >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','pepcats')
+ >>> AMP.calculate_crosscorr(7)
+ >>> AMP.descriptor
+ array([[ 0.6875 , 0.46666667, 0.42857143, 0.61538462, 0.58333333, ... ]])
+ >>> AMP.descriptor.shape
+ (1, 147)
+ """
+ desc = Parallel(n_jobs=-1)(delayed(_one_crosscorr)(seq, window, self.scale) for seq in self.sequences)
+
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ else:
+ self.descriptor = np.array(desc)
+
+ def calculate_moment(self, window=1000, angle=100, modality='max', append=False):
+ """Method for calculating the maximum or mean moment of the amino acid values for a given descriptor scale and
+ window.
+
+ :param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the
+ window, the length of the sequence is taken. So if the default window of 1000 is chosen, for all sequences
+ shorter than 1000, the **global** hydrophobic moment will be calculated. Otherwise, the maximal
+ hydrophiobic moment for the chosen window size found in the sequence will be returned.
+ :param angle: {int} angle in which to calculate the moment. **100** for alpha helices, **180** for beta sheets.
+ :param modality: {'all', 'max' or 'mean'} Calculate respectively maximum or mean hydrophobic moment. If all,
+ moments for all windows are returned.
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: Calculated descriptor as a numpy.array in the attribute :py:attr:`descriptor` and all possible global
+ values in :py:attr:`all_moms` (needed for the :py:func:`calculate_profile` method)
+ :Example:
+
+ >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL', 'eisenberg')
+ >>> AMP.calculate_moment()
+ >>> AMP.descriptor
+ array([[ 0.48790226]])
+ """
+ if self.scale['A'] == list:
+ print('\n Descriptor moment calculation is only possible for one dimensional descriptors.\n')
+
+ else:
+ desc = []
+ for seq in self.sequences:
+ wdw = min(window, len(seq)) # if sequence is shorter than window, take the whole sequence instead
+ mtrx = []
+ mwdw = []
+
+ for aa in range(len(seq)):
+ mtrx.append(self.scale[str(seq[aa])])
+
+ for i in range(len(mtrx) - wdw + 1):
+ mwdw.append(sum(mtrx[i:i + wdw], []))
+
+ mwdw = np.asarray(mwdw)
+ rads = angle * (np.pi / 180) * np.asarray(range(wdw)) # calculate actual moment (radial)
+ vcos = (mwdw * np.cos(rads)).sum(axis=1)
+ vsin = (mwdw * np.sin(rads)).sum(axis=1)
+ moms = np.sqrt(vsin ** 2 + vcos ** 2) / wdw
+
+ if modality == 'max': # take window with maximal value
+ moment = np.max(moms)
+ elif modality == 'mean': # take average value over all windows
+ moment = np.mean(moms)
+ elif modality == 'all':
+ moment = moms
+ else:
+ print('\nERROR!\nModality parameter is wrong, please choose between "all", "max" and "mean".\n')
+ return
+ desc.append(moment)
+ self.all_moms.append(moms)
+
+ desc = np.asarray(desc).reshape(len(desc), 1) # final descriptor array
+
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ else:
+ self.descriptor = np.array(desc)
+
+ def calculate_global(self, window=1000, modality='max', append=False):
+ """Method for calculating a global / window averaging descriptor value of a given AA scale
+
+ :param window: {int} amino acid window in which to calculate the moment. If the sequence is shorter than the
+ window, the length of the sequence is taken.
+ :param modality: {'max' or 'mean'} Calculate respectively maximum or mean hydrophobic moment.
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: Calculated descriptor as a numpy.array in the attribute :py:attr:`descriptor` and all possible global
+ values in :py:attr:`all_globs` (needed for the :py:func:`calculate_profile` method)
+ :Example:
+
+ >>> AMP = PeptideDescriptor('GLFDIVKKVVGALGSL','eisenberg')
+ >>> AMP.calculate_global(window=1000, modality='max')
+ >>> AMP.descriptor
+ array([[ 0.44875]])
+ """
+ desc = list()
+ for n, seq in enumerate(self.sequences):
+ wdw = min(window, len(seq)) # if sequence is shorter than window, take the whole sequence instead
+ mtrx = []
+ mwdw = []
+
+ for l in range(len(seq)): # translate AA sequence into values
+ mtrx.append(self.scale[str(seq[l])])
+
+ for i in range(len(mtrx) - wdw + 1):
+ mwdw.append(sum(mtrx[i:i + wdw], [])) # list of all the values for the different windows
+
+ mwdw = np.asarray(mwdw)
+ glob = np.sum(mwdw, axis=1) / float(wdw)
+ outglob = float()
+
+ if modality in ['max', 'mean']:
+ if modality == 'max':
+ outglob = np.max(glob) # returned moment will be the maximum of all windows
+ elif modality == 'mean':
+ outglob = np.mean(glob) # returned moment will be the mean of all windows
+ else:
+ print('Modality parameter is wrong, please choose between "max" and "mean"\n.')
+ return
+ desc.append(outglob)
+ self.all_globs.append(glob)
+
+ desc = np.asarray(desc).reshape(len(desc), 1)
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ else:
+ self.descriptor = np.array(desc)
+
+ def calculate_profile(self, prof_type='uH', window=7, append=False):
+ """Method for calculating hydrophobicity or hydrophobic moment profiles for given sequences and fitting for
+ slope and intercept. The hydrophobicity scale used is "eisenberg"
+
+ :param prof_type: prof_type of profile, available: 'H' for hydrophobicity or 'uH' for hydrophobic moment
+ :param window: {int} size of sliding window used (odd-numbered).
+ :param append: {boolean} whether the produced descriptor values should be appended to the existing ones in the
+ attribute :py:attr:`descriptor`.
+ :return: Fitted slope and intercept of calculated profile for every given sequence in the attribute
+ :py:attr:`descriptor`.
+ :Example:
+
+ >>> AMP = PeptideDescriptor('KLLKLLKKVVGALG','kytedoolittle')
+ >>> AMP.calculate_profile(prof_type='H')
+ >>> AMP.descriptor
+ array([[ 0.03731293, 0.19246599]])
+ """
+ if prof_type == 'uH':
+ self.calculate_moment(window=window)
+ y_vals = self.all_moms
+ elif prof_type == 'H':
+ self.calculate_global(window=window)
+ y_vals = self.all_globs
+ else:
+ print('prof_type parameter is unknown, choose "uH" for hydrophobic moment or "H" for hydrophobicity\n.')
+ sys.exit()
+
+ desc = list()
+ for n, seq in enumerate(self.sequences):
+ x_vals = range(len(seq))[int((window - 1) / 2):-int((window - 1) / 2)]
+ if len(seq) <= window:
+ slope, intercept, r_value, p_value, std_err = [0, 0, 0, 0, 0]
+ else:
+ slope, intercept, r_value, p_value, std_err = stats.linregress(x_vals, y_vals[n])
+ desc.append([slope, intercept])
+
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ else:
+ self.descriptor = np.array(desc)
+
+ def calculate_arc(self, modality="max", append=False):
+ """ Method for calculating property arcs as seen in the helical wheel plot. Use for binary amino acid scales only.
+
+ :param modality: modality of the arc to calculate, to choose between "max" and "mean".
+ :param append: if true, append to current descriptor stored in the descriptor attribute.
+ :return: calculated descriptor as numpy.array in the descriptor attribute.
+
+ :Example:
+
+ >>> arc = PeptideDescriptor("KLLKLLKKLLKLLK", scalename="peparc")
+ >>> arc.calculate_arc(modality="max", append=False)
+ >>> arc.descriptor
+ array([[200, 160, 160, 0, 0]])
+ """
+ desc = Parallel(n_jobs=-1)(delayed(_one_arc)(seq, modality, self.scale) for seq in self.sequences)
+
+ # Converts each of the amino acids to descriptor vector
+ for seq in self.sequences:
+
+ # desc_mat = []
+ # for aa in seq:
+ # desc_mat.append(self.scale[aa])
+ # desc_mat = np.asarray(desc_mat)
+ #
+ # # Check descriptor dimension
+ # desc_dim = desc_mat.shape[1]
+ #
+ # # list to store descriptor values for all windows
+ # allwindows_arc = []
+ #
+ # if len(seq) > 18:
+ # window = 18
+ # # calculates number of windows in sequence
+ # num_windows = len(seq) - window
+ # else:
+ # window = len(seq)
+ # num_windows = 1
+ #
+ # # loop through all windows
+ # for j in range(num_windows):
+ # # slices descriptor matrix into current window
+ # window_mat = desc_mat[j:j + window, :]
+ #
+ # # defines order of amino acids in helical projection
+ # order = [0, 11, 4, 15, 8, 1, 12, 5, 16, 9, 2, 13, 6, 17, 10, 3, 14, 7]
+ #
+ # # orders window descriptor matrix into helical projection order
+ # ordered = []
+ # for pos in order:
+ # try:
+ # ordered.append(window_mat[pos, :])
+ # except:
+ # # for sequences of len < 18 adding dummy vector with 2s, length of descriptor dimensions
+ # ordered.append([2] * desc_dim)
+ # ordered = np.asarray(ordered)
+ #
+ # window_arc = []
+ #
+ # # loop through pharmacophoric features
+ # for m in range(desc_dim):
+ # all_arcs = [] # stores all arcs that can be found of a pharmacophoric feature
+ # arc = 0
+ #
+ # for n in range(18): # for all positions in helix, regardless of sequence length
+ # if ordered[n, m] == 0: # if position does not contain pharmacophoric feature
+ # all_arcs.append(arc) # append previous arc to all arcs list
+ # arc = 0 # arc is initialized
+ # elif ordered[n, m] == 1: # if position contains pharmacophoric feature(PF), elongate arc by 20°
+ # arc += 20
+ # elif ordered[n, m] == 2: # if position doesn't contain amino acid:
+ # if ordered[n - 1, m] == 1: # if previous position contained PF add 10°
+ # arc += 10
+ # elif ordered[n - 1, m] == 0: # if previous position didn't contain PF don't add anything
+ # arc += 0
+ # elif ordered[
+ # n - 2, m] == 1: # if previous position is empty then check second previous for PF
+ # arc += 10
+ # if n == 17: # if we are at the last position check for position n=0 instead of next position.
+ # if ordered[0, m] == 1: # if it contains PF add 10° extra
+ # arc += 10
+ # else: # if next position contains PF add 10° extra
+ # if ordered[n + 1, m] == 1:
+ # arc += 10
+ # elif ordered[n + 1, m] == 0:
+ # arc += 0
+ # else: # if next position is empty check for 2nd next position
+ # if n == 16:
+ # if ordered[0, m] == 1:
+ # arc += 10
+ # else:
+ # if ordered[n + 2, m] == 1:
+ # arc += 10
+ #
+ # all_arcs.append(arc)
+ # if not arc == 360:
+ # arc0 = all_arcs.pop() + all_arcs[0] # join first and last arc together
+ # all_arcs = [arc0] + all_arcs[1:]
+ #
+ # window_arc.append(np.max(all_arcs)) # append to window arcs the maximum arc of this PF
+ # allwindows_arc.append(window_arc) # append all PF arcs of this window
+ #
+ # allwindows_arc = np.asarray(allwindows_arc)
+ #
+ # if modality == 'max':
+ # final_arc = np.max(allwindows_arc, axis=0) # calculate maximum / mean arc along all windows
+ # elif modality == 'mean':
+ # final_arc = np.mean(allwindows_arc, axis=0)
+ # else:
+ # print('modality is unknown, please choose between "max" and "mean"\n.')
+ # sys.exit()
+
+ if append:
+ self.descriptor = np.hstack((self.descriptor, np.array(desc)))
+ else:
+ self.descriptor = np.array(desc)
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 9caa9aa44fd8 cpt_helical_wheel/plotWheels/helical_wheel.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_helical_wheel/plotWheels/helical_wheel.py Tue Jul 05 05:21:34 2022 +0000
@@ -0,0 +1,233 @@
+import matplotlib
+matplotlib.use('Agg')
+
+import matplotlib.lines as lines
+import matplotlib.patches as patches
+import matplotlib.pyplot as plt
+#from mpl_toolkits.mplot3d import Axes3D
+import numpy as np
+from scipy.stats.kde import gaussian_kde
+
+from plotWheels.core import load_scale
+from plotWheels.descriptors import PeptideDescriptor
+
+def helical_wheel(sequence, colorcoding='rainbow', text_color=None,
+ lineweights=True, filename=None, seq=False, moment=False,
+ seqRange=1, t_size=32, rot=float(90), dpi=150, numbering=True):
+ """A function to project a given peptide sequence onto a helical wheel plot. It can be useful to illustrate the
+ properties of alpha-helices, like positioning of charged and hydrophobic residues along the sequence.
+
+ :param sequence: {str} the peptide sequence for which the helical wheel should be drawn
+ :param colorcoding: {str} the color coding to be used, available: *rainbow*, *charge*, *polar*, *simple*,
+ *amphipathic*, *custom_input*, *none*
+ :param lineweights: {boolean} defines whether connection lines decrease in thickness along the sequence
+ :param filename: {str} filename where to save the plot. *default = None* --> show the plot
+ :param seq: {bool} whether the amino acid sequence should be plotted as a title
+ :param moment: {bool} whether the Eisenberg hydrophobic moment should be calculated and plotted
+ :param seqRange: {int} starting value of residue location in sequence
+ :param t_size: {int} text size
+ :param rot: {float} rotation by radians --> converted to degrees.
+ :param dpi: {int} dpi parameter for saved files
+ :return: a helical wheel projection plot of the given sequence (interactively or in **filename**)
+ :Example:
+
+ >>> helical_wheel('GLFDIVKKVVGALG')
+ >>> helical_wheel('KLLKLLKKLLKLLK', colorcoding='charge')
+ >>> helical_wheel('AKLWLKAGRGFGRG', colorcoding='none', lineweights=False)
+ >>> helical_wheel('ACDEFGHIKLMNPQRSTVWY')
+
+ .. image:: ../docs/static/wheel1.png
+ :height: 300px
+ .. image:: ../docs/static/wheel2.png
+ :height: 300px
+ .. image:: ../docs/static/wheel3.png
+ :height: 300px
+ .. image:: ../docs/static/wheel4.png
+ :height: 300px
+
+ .. versionadded:: v2.1.5
+ """
+ # color mappings
+ aa = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
+ if colorcoding == type(str):
+ f_rainbow = ['#3e3e28', '#ffcc33', '#b30047', '#b30047', '#ffcc33', '#3e3e28', '#80d4ff', '#ffcc33', '#0047b3',
+ '#ffcc33', '#ffcc33', '#b366ff', '#29a329', '#b366ff', '#0047b3', '#ff66cc', '#ff66cc', '#ffcc33',
+ '#ffcc33', '#ffcc33']
+ f_charge = ['#000000', '#000000', '#ff4d94', '#ff4d94', '#000000', '#000000', '#80d4ff', '#000000', '#80d4ff',
+ '#000000', '#000000', '#000000', '#000000', '#000000', '#80d4ff', '#000000', '#000000', '#000000',
+ '#000000', '#000000']
+ f_polar = ['#000000', '#000000', '#80d4ff', '#80d4ff', '#000000', '#000000', '#80d4ff', '#000000', '#80d4ff',
+ '#000000', '#000000', '#80d4ff', '#000000', '#80d4ff', '#80d4ff', '#80d4ff', '#80d4ff', '#000000',
+ '#000000', '#000000']
+ f_simple = ['#ffcc33', '#ffcc33', '#0047b3', '#0047b3', '#ffcc33', '#7f7f7f', '#0047b3', '#ffcc33', '#0047b3',
+ '#ffcc33', '#ffcc33', '#0047b3', '#ffcc33', '#0047b3', '#0047b3', '#0047b3', '#0047b3', '#ffcc33',
+ '#ffcc33', '#ffcc33']
+ f_none = ['#ffffff'] * 20
+ f_amphi = ['#ffcc33', '#29a329', '#b30047', '#b30047', '#f79318', '#80d4ff', '#0047b3', '#ffcc33', '#0047b3',
+ '#ffcc33', '#ffcc33', '#80d4ff', '#29a329', '#80d4ff', '#0047b3', '#80d4ff', '#80d4ff', '#ffcc33',
+ '#f79318', '#f79318']
+ t_rainbow = ['w', 'k', 'w', 'w', 'k', 'w', 'k', 'k', 'w', 'k', 'k', 'k', 'k', 'k', 'w', 'k', 'k', 'k', 'k', 'k']
+ t_charge = ['w', 'w', 'k', 'k', 'w', 'w', 'k', 'w', 'k', 'w', 'w', 'w', 'w', 'w', 'k', 'w', 'w', 'w', 'w', 'w']
+ t_polar = ['w', 'w', 'k', 'k', 'w', 'w', 'k', 'w', 'k', 'w', 'w', 'k', 'w', 'k', 'k', 'k', 'k', 'w', 'w', 'w']
+ t_simple = ['k', 'k', 'w', 'w', 'k', 'w', 'w', 'k', 'w', 'k', 'k', 'k', 'k', 'w', 'w', 'w', 'w', 'k', 'k', 'k']
+ t_none = ['k'] * 20
+ t_amphi = ['k', 'k', 'w', 'w', 'w', 'k', 'w', 'k', 'w', 'k', 'k', 'k', 'w', 'k', 'w', 'k', 'k', 'k', 'w', 'w']
+ d_eisberg = load_scale('eisenberg')[1] # eisenberg hydrophobicity values for HM
+ else:
+ f_custom = colorcoding
+ t_custom = text_color
+ d_eisberg = load_scale('eisenberg')[1]
+
+ if lineweights:
+ lw = np.arange(0.1, 5.5, 5. / (len(sequence) - 1)) # line thickness array
+ lw = lw[::-1] # inverse order
+ else:
+ lw = [2.] * (len(sequence) - 1)
+ # check which color coding to use
+ if colorcoding == type(str):
+ if colorcoding == 'rainbow':
+ df = dict(zip(aa, f_rainbow))
+ dt = dict(zip(aa, t_rainbow))
+ elif colorcoding == 'charge':
+ df = dict(zip(aa, f_charge))
+ dt = dict(zip(aa, t_charge))
+ elif colorcoding == 'polar':
+ df = dict(zip(aa, f_polar))
+ dt = dict(zip(aa, t_polar))
+ elif colorcoding == 'simple':
+ df = dict(zip(aa, f_simple))
+ dt = dict(zip(aa, t_simple))
+ elif colorcoding == 'none':
+ df = dict(zip(aa, f_none))
+ dt = dict(zip(aa, t_none))
+ elif colorcoding == 'amphipathic':
+ df = dict(zip(aa, f_amphi))
+ dt = dict(zip(aa, t_amphi))
+ else:
+ print("Unknown color coding, 'rainbow' used instead")
+ df = dict(zip(aa, f_rainbow))
+ dt = dict(zip(aa, t_rainbow))
+ else:
+ df = dict(zip(aa, f_custom))
+ dt = dict(zip(aa, t_custom))
+
+ # degree to radian
+ deg = np.arange(float(len(sequence))) * -100.
+ deg = [d + rot for d in deg] # start at 270 degree in unit circle (on top)
+ rad = np.radians(deg)
+
+ # dict for coordinates and eisenberg values
+ d_hydro = dict(zip(rad, [0.] * len(rad)))
+
+ # create figure
+ fig = plt.figure(frameon=False, figsize=(10, 10))
+ ax = fig.add_subplot(111)
+ old = None
+ hm = list()
+
+ # iterate over sequence
+ for i, r in enumerate(rad):
+ new = (np.cos(r), np.sin(r)) # new AA coordinates
+ if i < 18:
+ # plot the connecting lines
+ if old is not None:
+ line = lines.Line2D((old[0], new[0]), (old[1], new[1]), transform=ax.transData, color='k',
+ linewidth=lw[i - 1])
+ line.set_zorder(1) # 1 = level behind circles
+ ax.add_line(line)
+ elif 17 < i < 36:
+ line = lines.Line2D((old[0], new[0]), (old[1], new[1]), transform=ax.transData, color='k',
+ linewidth=lw[i - 1])
+ line.set_zorder(1) # 1 = level behind circles
+ ax.add_line(line)
+ new = (np.cos(r) * 1.2, np.sin(r) * 1.2)
+ elif i == 36:
+ line = lines.Line2D((old[0], new[0]), (old[1], new[1]), transform=ax.transData, color='k',
+ linewidth=lw[i - 1])
+ line.set_zorder(1) # 1 = level behind circles
+ ax.add_line(line)
+ new = (np.cos(r) * 1.4, np.sin(r) * 1.4)
+ else:
+ new = (np.cos(r) * 1.4, np.sin(r) * 1.4)
+
+ # plot circles
+ circ = patches.Circle(new, radius=0.125, transform=ax.transData, edgecolor='k', facecolor=df[sequence[i]])
+ circ.set_zorder(2) # level in front of lines
+ ax.add_patch(circ)
+
+ # check if N- or C-terminus and add subscript, then plot AA letter
+ if numbering:
+ size = t_size
+ if i == 0:
+ ax.text(new[0], new[1], sequence[i] + '$_N$', va='center', ha='center', transform=ax.transData,
+ size=size, color=dt[sequence[i]], fontweight='bold')
+ elif i == len(sequence) - 1:
+ ax.text(new[0], new[1], sequence[i] + '$_C$', va='center', ha='center', transform=ax.transData,
+ size=size, color=dt[sequence[i]], fontweight='bold')
+ else:
+ seqRange += 1
+ ax.text(new[0], new[1], sequence[i] + '$_{'+str(seqRange)+'}$', va='center', ha='center', transform=ax.transData,
+ size=size, color=dt[sequence[i]], fontweight='bold')
+
+ eb = d_eisberg[sequence[i]][0] # eisenberg value for this AA
+ hm.append([eb * new[0], eb * new[1]]) # save eisenberg hydrophobicity vector value to later calculate HM
+
+ old = (np.cos(r), np.sin(r)) # save as previous coordinates
+
+ else:
+ size = t_size
+ if i == 0:
+ ax.text(new[0], new[1], sequence[i] + '$_N$', va='center', ha='center', transform=ax.transData,
+ size=size, color=dt[sequence[i]], fontweight='bold')
+ elif i == len(sequence) - 1:
+ ax.text(new[0], new[1], sequence[i] + '$_C$', va='center', ha='center', transform=ax.transData,
+ size=size, color=dt[sequence[i]], fontweight='bold')
+ else:
+ ax.text(new[0], new[1], sequence[i], va='center', ha='center', transform=ax.transData,
+ size=size, color=dt[sequence[i]], fontweight='bold')
+
+ eb = d_eisberg[sequence[i]][0] # eisenberg value for this AA
+ hm.append([eb * new[0], eb * new[1]]) # save eisenberg hydrophobicity vector value to later calculate HM
+
+ old = (np.cos(r), np.sin(r)) # save as previous coordinates
+
+ # draw hydrophobic moment arrow if moment option
+ if moment:
+ v_hm = np.sum(np.array(hm), 0)
+ x = .0333 * v_hm[0]
+ y = .0333 * v_hm[1]
+ ax.arrow(0., 0., x, y, head_width=0.04, head_length=0.03, transform=ax.transData,
+ color='k', linewidth=6.)
+ desc = PeptideDescriptor(sequence) # calculate hydrophobic moment
+ desc.calculate_moment()
+ if abs(x) < 0.2 and y > 0.: # right positioning of HM text so arrow does not cover it
+ z = -0.2
+ else:
+ z = 0.2
+ plt.text(0., z, str(round(desc.descriptor[0][0], 3)), fontdict={'fontsize': 20, 'fontweight': 'bold',
+ 'ha': 'center'})
+
+ # plot shape
+ if len(sequence) < 19:
+ ax.set_xlim(-1.2, 1.2)
+ ax.set_ylim(-1.2, 1.2)
+ else:
+ ax.set_xlim(-1.4, 1.4)
+ ax.set_ylim(-1.4, 1.4)
+ ax.spines['right'].set_visible(False)
+ ax.spines['top'].set_visible(False)
+ ax.spines['left'].set_visible(False)
+ ax.spines['bottom'].set_visible(False)
+ cur_axes = plt.gca()
+ cur_axes.axes.get_xaxis().set_visible(False)
+ cur_axes.axes.get_yaxis().set_visible(False)
+ plt.tight_layout()
+
+ if seq:
+ plt.title(sequence, fontweight='bold', fontsize=20)
+
+ # show or save plot
+ if filename:
+ plt.savefig(filename, dpi=dpi)
+ else:
+ plt.show()