annotate spring_minz.py @ 16:16eb2acaaa20 draft

"planemo upload commit 0e4e1f8de9464b411152c44f4edd099db8ad9e0b"
author guerler
date Sat, 24 Oct 2020 17:48:06 +0000
parents 4a4888bf0338
children c790d25086dc
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
1 #! /usr/bin/env python3
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
2 import argparse
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
3 import os
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
4
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
def main(args):
    """Predict interaction partners for one target from matching HH-search hits.

    Reads the identifiers listed in ``args.list``, the core/partner pairs in
    ``args.crossreference`` and the hit scores of ``args.target``.  For every
    listed identifier the hit scores of ``<args.inputs>/<identifier>`` are
    loaded, and every (target hit, candidate hit) pair linked through the
    cross reference is scored with the lower of its two scores.  The highest
    such score is the candidate's min-Z.  Candidates whose min-Z exceeds
    ``args.minscore`` are appended to ``args.output`` in descending min-Z
    order, one tab-separated line each: ``args.name``, candidate, min-Z,
    top target hit, top candidate hit, matched target hit, matched candidate
    hit.

    Args:
        args: Parsed command line namespace providing ``list``,
            ``crossreference``, ``target``, ``name``, ``inputs``, ``output``,
            ``minscore`` and ``idx``.
    """
    # Blank lines carry no identifier; keeping them would only trigger a
    # lookup of the bare input directory and inflate the reported count.
    with open(args.list) as names_file:
        names = [entry for entry in (line.strip() for line in names_file) if entry]
    print("Loaded %s names from `%s`." % (len(names), args.list))

    # Map each core identifier (first column) to all partners (last column).
    crossreference = {}
    with open(args.crossreference) as crossreference_file:
        for line in crossreference_file:
            columns = line.split()
            if not columns:
                # An empty line (e.g. a trailing newline) has no columns to index.
                continue
            crossreference.setdefault(columns[0], []).append(columns[-1])
    print("Loaded cross reference from `%s`." % args.crossreference)

    toptarget, targets = get_template_scores(args.target, args.minscore, args.idx)
    interactions = []
    if not targets:
        print("No targets found `%s`" % args.target)
    else:
        print("Loaded target scores from `%s`." % args.target)
        # The directory does not change between candidates, normalize it once.
        input_directory = args.inputs.rstrip("/")
        for name in names:
            input_file = "%s/%s" % (input_directory, name)
            toptemplate, templates = get_template_scores(input_file, args.minscore, args.idx)
            minz = 0
            mint = ""
            for t, target_score in targets.items():
                for p in crossreference.get(t, ()):
                    if p not in templates:
                        continue
                    # A pair is only as reliable as its weaker hit.
                    score = min(target_score, templates[p])
                    if score > minz:
                        minz = score
                        mint = "%s\t%s\t%s\t%s" % (toptarget, toptemplate, t, p)
            if minz > args.minscore:
                interactions.append((name, minz, mint))
                print("Predicting: %s, min-Z: %s, templates: %s" % (name, minz, mint))
        interactions.sort(key=lambda tup: tup[1], reverse=True)

    # Append mode: results are added to whatever the output file already holds.
    with open(args.output, 'a+') as output_file:
        for candidate, minz, mint in interactions:
            output_file.write("%s\t%s\t%s\t%s\n" % (args.name, candidate, minz, mint))
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
49
9
4ac5d5a9b21c "planemo upload commit f465445ebca0307953f59938494a2244ca8ea23c"
guerler
parents: 8
diff changeset
def get_template_scores(hhr_file, min_score, identifier_length):
    """Collect the hits of an HHR result file that score above a threshold.

    The first nine lines of the file are skipped.  The lines that follow are
    read as fixed-width hit rows until the first blank line: the identifier
    is taken from column 4 onwards and the score from columns 57-62.

    Args:
        hhr_file: Path of the result file.  A path that is not an existing
            file yields no hits instead of raising.
        min_score: Exclusive lower bound a hit score has to exceed.
        identifier_length: Number of characters reserved for the identifier.

    Returns:
        A ``(toptemplate, result)`` tuple.  ``toptemplate`` is the identifier
        of the first row above the threshold, or ``None`` if there is none.
        ``result`` maps identifier to score; when an identifier occurs in
        several rows the score of the last such row is kept.

    Raises:
        ValueError: If the score columns of a hit row are not a number.
    """
    result = {}
    toptemplate = None
    if not os.path.isfile(hhr_file):
        return toptemplate, result

    identifier_end = identifier_length + 4
    with open(hhr_file) as hhr:
        for index, line in enumerate(hhr):
            if index <= 8:
                continue
            if not line.strip():
                break
            # Identifiers shorter than the reserved width are padded with
            # blanks; strip them so the id can match whitespace-split keys.
            template_id = line[4:identifier_end].strip()
            template_score = float(line[57:63])
            if template_score > min_score:
                if toptemplate is None:
                    toptemplate = template_id
                result[template_id] = template_score
    return toptemplate, result
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
67
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
if __name__ == "__main__":
    # Command line entry point: collect the options and hand them to main().
    parser = argparse.ArgumentParser(description='This script identifies interactions by detecting matching HH-search results.')
    parser.add_argument('-t', '--target', help='HHR target file result', required=True)
    parser.add_argument('-n', '--name', help='HHR target name', required=True)
    parser.add_argument('-c', '--crossreference', help='Cross Reference index file', required=True)
    parser.add_argument('-x', '--idx', help='Length of identifier', type=int, default=6)
    parser.add_argument('-l', '--list', help='Text file containing identifiers.', required=True)
    parser.add_argument('-i', '--inputs', help='Directory containing `hhr` files', required=True)
    parser.add_argument('-o', '--output', help='Output file containing min-Z scores', required=True)
    # The threshold is compared against float scores, so fractional values
    # are accepted; integer arguments keep working unchanged.
    parser.add_argument('-m', '--minscore', help='min-Z score threshold', type=float, default=10)
    args = parser.parse_args()
    main(args)