annotate local_AAComposition.py @ 35:a662eb3f87c2 draft

Uploaded
author jose_duarte
date Tue, 13 Jun 2023 09:53:42 +0000
parents 52e50de4c005
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
26
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
1
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
2 # -*- coding: utf-8 -*-
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
3 """
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
4 The module is used for computing the composition of amino acids, dipeptides and
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
5 3-mers (tri-peptide) for a given protein sequence.
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
6 References
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
7 ----------
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
8 .. [1] Reczko, M. and Bohr, H. (1994) The DEF data base of sequence based protein
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
9 fold class predictions. Nucleic Acids Res, 22, 3616-3619.
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
10 .. [2] Hua, S. and Sun, Z. (2001) Support vector machine approach for protein
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
11 subcellular localization prediction. Bioinformatics, 17, 721-728.
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
12 .. [3] Grassmann, J., Reczko, M., Suhai, S. and Edler, L. (1999) Protein fold
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
13 class prediction: new methods of statistical classification. Proc Int Conf
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
14 Intell Syst Mol Biol, 106-112.
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
15 Authors: Dongsheng Cao and Yizeng Liang.
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
16 Date: 2012.3.27
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
17 Email: oriental-cds@163.com
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
18 """
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
19
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
20 # Core Library
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
21 import re
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
22 from typing import Any, Dict, List
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
23
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
# One-letter amino acid codes (20 entries). The functions below iterate this
# list, so its order fixes the key order of the dictionaries they return.
24 AALetter: List[str] = list("ARNDCEQGHILKMFPSTWYV")
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
25
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
# Reusable parameter description for ``ProteinSequence``; no function visible
# in this section references it.
26 ProteinSequence_docstring = """ProteinSequence: str
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
27 a pure protein sequence"""
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
28
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
29
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
def CalculateAAComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Calculate the percentage composition of each amino acid in a sequence.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence

    Returns
    -------
    Dict[str, float]
        Maps every one-letter code in ``AALetter`` to the percentage of
        sequence positions holding that residue, rounded to 3 decimals.
        An empty sequence yields 0.0 for every residue instead of raising
        ``ZeroDivisionError``.
    """
    sequence_length = len(ProteinSequence)
    if sequence_length == 0:
        # Nothing to count: report an all-zero composition rather than
        # dividing by zero below.
        return {residue: 0.0 for residue in AALetter}
    return {
        residue: round(
            float(ProteinSequence.count(residue)) / sequence_length * 100, 3
        )
        for residue in AALetter
    }
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
36
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
37
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
def CalculateDipeptideComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Calculate the percentage composition of each dipeptide in a sequence.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence

    Returns
    -------
    Dict[str, float]
        Maps every two-letter combination of ``AALetter`` codes to the
        percentage of adjacent residue pairs equal to that dipeptide,
        rounded to 2 decimals. Overlapping pairs are all counted, so the
        values of a sequence made only of ``AALetter`` residues sum to
        roughly 100. Sequences shorter than two residues yield 0.0 for
        every dipeptide.
    """
    # A sequence of length n contains n - 1 adjacent residue pairs.
    pair_total = len(ProteinSequence) - 1
    counts: Dict[str, int] = {
        first + second: 0 for first in AALetter for second in AALetter
    }
    if pair_total <= 0:
        # No pairs exist; avoid dividing by zero (or by -1 for empty input).
        return {dipeptide: 0.0 for dipeptide in counts}
    # Slide a two-residue window over the sequence. str.count would only see
    # non-overlapping matches ("AAA" holds two "AA" pairs, not one), which
    # disagrees with the n - 1 denominator.
    for start in range(pair_total):
        pair = ProteinSequence[start : start + 2]
        if pair in counts:
            counts[pair] += 1
    return {
        dipeptide: round(float(count) / pair_total * 100, 2)
        for dipeptide, count in counts.items()
    }
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
48
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
49
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
def Getkmers() -> List[str]:
    """
    Build every three-letter combination of the codes in ``AALetter``.

    Returns
    -------
    List[str]
        All 3-mers, ordered with the first residue varying slowest and the
        third residue varying fastest.
    """
    return [
        first + second + third
        for first in AALetter
        for second in AALetter
        for third in AALetter
    ]
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
57
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
58
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
def GetSpectrumDict(proteinsequence: str) -> Dict[str, int]:
    """
    Count the occurrences of every 3-mer in a protein sequence.

    Parameters
    ----------
    proteinsequence : str
        a pure protein sequence

    Returns
    -------
    Dict[str, int]
        Maps each 3-mer returned by ``Getkmers`` to the number of positions
        at which it occurs. Overlapping occurrences are all counted
        ("AAAA" contains "AAA" twice). Windows containing characters that
        do not form one of those 3-mers are ignored.
    """
    spectrum: Dict[str, int] = dict.fromkeys(Getkmers(), 0)
    # One pass over all length-3 windows instead of one regex scan per 3-mer;
    # this also counts self-overlapping 3-mers that re.findall would skip.
    for start in range(len(proteinsequence) - 2):
        kmer = proteinsequence[start : start + 3]
        if kmer in spectrum:
            spectrum[kmer] += 1
    return spectrum
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
65
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
66
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
def CalculateAADipeptideComposition(ProteinSequence: str) -> Dict[str, float]:
    """
    Combine amino acid, dipeptide and 3-mer descriptors into one dictionary.

    Parameters
    ----------
    ProteinSequence : str
        a pure protein sequence

    Returns
    -------
    Dict[str, float]
        The merged results of ``CalculateAAComposition``,
        ``CalculateDipeptideComposition`` and ``GetSpectrumDict``, in that
        order. The one- and two-letter keys hold percentages; the
        three-letter keys hold the integer counts from ``GetSpectrumDict``.
    """
    amino_acid_part = CalculateAAComposition(ProteinSequence)
    dipeptide_part = CalculateDipeptideComposition(ProteinSequence)
    spectrum_part = GetSpectrumDict(ProteinSequence)
    combined: Dict[Any, Any] = {
        **amino_acid_part,
        **dipeptide_part,
        **spectrum_part,
    }
    return combined
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
73
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
74
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
75
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
76
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
77
52e50de4c005 Uploaded
jose_duarte
parents:
diff changeset
78