Mercurial > repos > guerler > springsuite
comparison spring_map.py @ 37:0be0af9e695d draft
"planemo upload commit c716195a2cc1ed30ff8c4936621091296a93b2fc"
author | guerler |
---|---|
date | Wed, 25 Nov 2020 14:35:35 +0000 |
parents | |
children | 80a4b98121b6 |
comparison
equal
deleted
inserted
replaced
36:2fe8ffff530d | 37:0be0af9e695d |
---|---|
1 #! /usr/bin/env python3 | |
2 import argparse | |
3 import os | |
4 | |
5 from spring_package.DBKit import createFile | |
6 from spring_package.Molecule import Molecule | |
7 | |
8 | |
9 def getId(line): | |
10 line = line.strip() | |
11 return line[:4].upper() + line[4:6] | |
12 | |
13 | |
14 def getPDB(line, args): | |
15 pdb = line[:4].lower() | |
16 pdbChain = line[5:6] | |
17 pdbFile = "%s/temp.pdb" % args.temp | |
18 pdbDatabaseId = "%s.pdb" % pdb | |
19 createFile(pdbDatabaseId, args.index, args.database, pdbFile) | |
20 return pdbFile, pdbChain | |
21 | |
22 | |
23 def getSequences(fileName): | |
24 sequences = dict() | |
25 with open(fileName) as file: | |
26 for line in file: | |
27 if line.startswith(">"): | |
28 name = getId(line.split()[0][1:]) | |
29 nextLine = next(file) | |
30 sequences[name] = nextLine | |
31 return sequences | |
32 | |
33 | |
34 def getHomologue(queryFile, queryResultFile, databaseFile): | |
35 if not os.path.isfile(queryResultFile): | |
36 os.system("psiblast -query %s -db %s -out %s" % (queryFile, | |
37 databaseFile, queryResultFile)) | |
38 maxMatch = None | |
39 try: | |
40 with open(queryResultFile) as file: | |
41 for i in range(38): | |
42 line = next(file) | |
43 maxMatch = getId(line.split()[0]) | |
44 except Exception: | |
45 return None | |
46 return maxMatch | |
47 | |
48 | |
49 def main(args): | |
50 logFile = open(args.log, "w") | |
51 temp = args.temp.rstrip("/") | |
52 templates = set() | |
53 os.system("mkdir -p %s" % temp) | |
54 templateSequenceFile = "%s/templates.fasta" % temp | |
55 if not os.path.isfile(templateSequenceFile): | |
56 templateSequences = open(templateSequenceFile, "w") | |
57 with open(args.list) as file: | |
58 for rawId in file: | |
59 templateId = getId(rawId) | |
60 templates.add(templateId) | |
61 pdbFile, pdbChain = getPDB(templateId, args) | |
62 try: | |
63 templateMol = Molecule(pdbFile) | |
64 templateSeq = templateMol.getSequence(pdbChain) | |
65 templateSequences.write(">%s\n" % templateId) | |
66 templateSequences.write("%s\n" % templateSeq) | |
67 except Exception: | |
68 logFile.write("Warning: File not found [%s].\n" % pdbFile) | |
69 templateSequences.close() | |
70 os.system("makeblastdb -in %s -dbtype prot" % templateSequenceFile) | |
71 else: | |
72 logFile.write("Using existing sequences for templates [%s].\n" % templateSequenceFile) | |
73 logFile.write("Found %s template entries from `%s`.\n" % (len(templates), args.list)) | |
74 logFile.flush() | |
75 | |
76 crossReference = list() | |
77 with open(args.cross) as file: | |
78 for line in file: | |
79 cols = line.split() | |
80 if len(cols) != 2: | |
81 raise Exception("Invalid line in crossreference [%s]." % line) | |
82 crossReference.append(dict(core=cols[0], partner=cols[1])) | |
83 logFile.write("Loaded crossreference with %d entries.\n" % len(crossReference)) | |
84 logFile.flush() | |
85 | |
86 for refEntry in crossReference: | |
87 coreId = refEntry["core"] | |
88 partnerId = refEntry["partner"] | |
89 partnerFile = "%s/%s.fasta" % (temp, partnerId) | |
90 partnerResultFile = "%s.result" % partnerFile | |
91 if partnerId in templates: | |
92 refEntry["match"] = partnerId | |
93 logFile.write("Found partner in template list alignment [%s].\n" % partnerId) | |
94 else: | |
95 logFile.write("Processing %s.\n" % partnerId) | |
96 if not os.path.isfile(partnerResultFile): | |
97 pdbFile, pdbChain = getPDB(partnerId, args) | |
98 partnerMol = Molecule(pdbFile) | |
99 partnerSeq = partnerMol.getSequence(pdbChain) | |
100 with open(partnerFile, "w") as partnerFasta: | |
101 partnerFasta.write(">%s\n" % partnerId) | |
102 partnerFasta.write("%s" % partnerSeq) | |
103 else: | |
104 logFile.write("Using existing results. [%s].\n" % partnerId) | |
105 matchedId = getHomologue(partnerFile, partnerResultFile, | |
106 templateSequenceFile) | |
107 if matchedId is None: | |
108 logFile.write("Warning: Failed alignment [%s].\n" % partnerId) | |
109 else: | |
110 logFile.write("Found matching entry %s.\n" % matchedId) | |
111 refEntry["match"] = matchedId | |
112 logFile.flush() | |
113 | |
114 finalSet = set() | |
115 for refEntry in crossReference: | |
116 coreId = refEntry["core"] | |
117 partnerId = refEntry["partner"] | |
118 if "match" in refEntry: | |
119 entry = "%s\t%s" % (coreId, refEntry["match"]) | |
120 if entry not in finalSet: | |
121 finalSet.add(entry) | |
122 else: | |
123 logFile.write("Warning: Skipping failed missing partner match [%s, %s].\n" % (coreId, partnerId)) | |
124 logFile.write("Found %s cross reference entries.\n" % len(finalSet)) | |
125 logFile.close() | |
126 | |
127 with open(args.output, 'w') as output_file: | |
128 for entry in sorted(finalSet): | |
129 output_file.write("%s\n" % entry) | |
130 | |
131 | |
132 if __name__ == "__main__": | |
133 parser = argparse.ArgumentParser(description='Maps binding partners to template library') | |
134 parser.add_argument('-l', '--list', help='List of template entries [PDB_CHAIN]', required=True) | |
135 parser.add_argument('-i', '--index', help='PDB Database Index file (dbkit_index)', required=True) | |
136 parser.add_argument('-d', '--database', help='PDB Database files (dbkit)', required=True) | |
137 parser.add_argument('-c', '--cross', help='Cross reference (unmapped)', required=True) | |
138 parser.add_argument('-o', '--output', help='Cross reference', required=True) | |
139 parser.add_argument('-g', '--log', help='Log File', required=True) | |
140 parser.add_argument('-t', '--temp', help='Temporary directory', required=True) | |
141 args = parser.parse_args() | |
142 main(args) |