annotate spring_minz.py @ 17:c790d25086dc draft

"planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
author guerler
date Wed, 28 Oct 2020 05:11:56 +0000
parents 16eb2acaaa20
children 5feab7f00f02
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
1 #! /usr/bin/env python3
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
2 import argparse
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
3 import os
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
4
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
5 def main(args):
17
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
6 inputs = set()
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
7 with open(args.inputlist) as file:
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
8 for index, line in enumerate(file):
17
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
9 name = line.strip()
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
10 inputs.add(name)
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
11 print ("Loaded %s input names from `%s`." % (len(inputs), args.inputlist))
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
12 targets = set()
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
13 duplicates = 0
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
14 with open(args.targetlist) as file:
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
15 for index, line in enumerate(file):
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
16 name = line.strip()
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
17 targets.add(name)
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
18 if name in inputs:
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
19 duplicates = duplicates + 1
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
20 print ("Loaded %s target names from `%s`." % (len(targets), args.targetlist))
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
21 crossReference = dict()
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
22 with open(args.crossreference) as file:
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
23 for index, line in enumerate(file):
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
24 columns = line.split()
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
25 core = columns[0]
16
16eb2acaaa20 "planemo upload commit 0e4e1f8de9464b411152c44f4edd099db8ad9e0b"
guerler
parents: 15
diff changeset
26 partner = columns[-1]
17
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
27 if core not in crossReference:
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
28 crossReference[core] = []
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
29 crossReference[core].append(partner)
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
30 print ("Loaded cross reference from `%s`." % args.crossreference)
17
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
31 interactions = dict()
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
32 for targetName in targets:
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
33 targetDirectory = args.targetpath.rstrip("/")
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
34 targetFile = "%s/%s" % (targetDirectory, targetName)
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
35 matchScores(targetFile=targetFile,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
36 targetName=targetName,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
37 inputs=inputs,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
38 inputPath=args.inputpath,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
39 crossReference=crossReference,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
40 minScore=args.minscore,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
41 idLength=args.idx,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
42 interactions=interactions)
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
43 if duplicates == len(targets):
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
44 for inputName in inputs:
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
45 inputDirectory = args.inputpath.rstrip("/")
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
46 inputFile = "%s/%s" % (inputDirectory, inputName)
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
47 matchScores(targetFile=inputFile,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
48 targetName=inputName,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
49 inputs=targets,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
50 inputPath=args.targetpath,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
51 crossReference=crossReference,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
52 minScore=args.minscore,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
53 idLength=args.idx,
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
54 interactions=interactions)
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
55 interactions = sorted(interactions.values(), key=lambda item: item["minZ"], reverse=True)
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
56 with open(args.output, 'w') as output_file:
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
57 for entry in interactions:
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
58 output_file.write("%s\t%s\t%s\t%s\n" % (entry["targetName"], entry["inputName"], entry["minZ"], entry["minInfo"]))
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
59
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
def matchScores(targetFile, targetName, inputs, inputPath, crossReference, minScore, idLength, interactions):
    """Record the best-scoring interaction between one target and each input.

    Template scores are loaded with `getTemplateScores` for `targetFile` and
    for `<inputPath>/<inputName>` of every name in `inputs`. For each target
    template listed in `crossReference`, every partner that is also an input
    hit yields the candidate score min(target score, partner score); the
    largest candidate per input is its min-Z. Pairs whose min-Z exceeds
    `minScore` are stored in `interactions` under a key built from the two
    names (larger name first) and the min-Z value; existing keys are kept.

    `interactions` is modified in place and also returned.
    """
    targetTop, targetHits = getTemplateScores(targetFile, minScore, idLength)
    if not targetHits:
        print("No targets found `%s`" % targetFile)
        # Without target hits no pair can score, so skip reading the inputs.
        return interactions
    print("Loaded target scores from `%s`." % targetFile)

    # Both are invariant across inputs: the directory prefix and the target
    # templates that have cross-reference partners at all.
    inputDirectory = inputPath.rstrip("/")
    referencedHits = [(t, targetHits[t], crossReference[t]) for t in targetHits if t in crossReference]

    for inputName in inputs:
        inputFile = "%s/%s" % (inputDirectory, inputName)
        inputTop, inputHits = getTemplateScores(inputFile, minScore, idLength)
        minZ = 0
        minInfo = ""
        for t, targetScore, partners in referencedHits:
            for p in partners:
                if p not in inputHits:
                    continue
                score = min(targetScore, inputHits[p])
                if score > minZ:
                    minZ = score
                    minInfo = "%s\t%s\t%s\t%s" % (targetTop, inputTop, t, p)
        if minZ <= minScore:
            continue
        # Order the two names so both directions of a pair share one key.
        if targetName > inputName:
            interactionKey = "%s_%s_%s" % (targetName, inputName, minZ)
        else:
            interactionKey = "%s_%s_%s" % (inputName, targetName, minZ)
        if interactionKey not in interactions:
            interactions[interactionKey] = dict(targetName=targetName, inputName=inputName, minZ=minZ, minInfo=minInfo)
            print("Predicting: %s, min-Z: %s, templates: %s" % (inputName, minZ, minInfo))
    return interactions
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
90
17
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
def getTemplateScores(hhrFile, minScore, identifierLength):
    """Read template identifiers and scores from the hit table of `hhrFile`.

    Rows are read from the tenth line of the file up to the first blank line.
    The identifier is taken from the `identifierLength` characters starting
    at column 4, the score from columns 57-62. Only scores strictly greater
    than `minScore` are kept; a later row for the same identifier replaces
    the earlier score. Rows whose score column cannot be parsed as a number
    (e.g. truncated lines) are skipped.

    Returns a tuple `(topTemplate, scores)`: the identifier of the first row
    that passed the threshold (or None) and a dict mapping identifier to
    score. A path that is not an existing file yields `(None, {})`.
    """
    result = dict()
    topTemplate = None
    if not os.path.isfile(hhrFile):
        return topTemplate, result
    identifierEnd = identifierLength + 4
    with open(hhrFile) as file:
        for index, line in enumerate(file):
            if index <= 8:
                # The first nine lines precede the hit rows.
                continue
            if not line.strip():
                break
            try:
                templateScore = float(line[57:63])
            except ValueError:
                # One malformed row should not abort the whole run.
                continue
            if templateScore > minScore:
                templateId = line[4:identifierEnd]
                if topTemplate is None:
                    topTemplate = templateId
                result[templateId] = templateScore
    return topTemplate, result
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
108
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    parser = argparse.ArgumentParser(description='This script identifies interactions by detecting matching HH-search results.')
    parser.add_argument('-il', '--inputlist', help='Text file containing identifiers.', required=True)
    parser.add_argument('-ip', '--inputpath', help='Directory containing `hhr` files', required=True)
    parser.add_argument('-tl', '--targetlist', help='Text file containing identifiers.', required=True)
    parser.add_argument('-tp', '--targetpath', help='Directory containing `hhr` files', required=True)
    parser.add_argument('-c', '--crossreference', help='Cross Reference index file', required=True)
    parser.add_argument('-x', '--idx', help='Length of identifier', type=int, default=6)
    parser.add_argument('-o', '--output', help='Output file containing min-Z scores', required=True)
    # Parsed as float so fractional thresholds are accepted; the scores it is
    # compared against are floats, and integer values still parse unchanged.
    parser.add_argument('-m', '--minscore', help='min-Z score threshold', type=float, default=10)
    args = parser.parse_args()
    main(args)