Mercurial > repos > guerler > springsuite
annotate spring_minz.py @ 19:dfd972ba93a4 draft
"planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
author | guerler |
---|---|
date | Wed, 28 Oct 2020 05:15:42 +0000 |
parents | c790d25086dc |
children | 5feab7f00f02 |
rev | line source |
---|---|
0
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
1 #! /usr/bin/env python3 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
2 import argparse |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
3 import os |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
4 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
def _readNames(path):
    """Return the set of non-empty, whitespace-stripped lines found in `path`."""
    names = set()
    with open(path) as file:
        for line in file:
            name = line.strip()
            # Skip blank lines so they do not turn into a bogus "" identifier.
            if name:
                names.add(name)
    return names


def main(args):
    """Match the score files of all targets against all inputs and write the result.

    Reads the following attributes from `args`:
      inputlist / targetlist -- text files with one identifier per line
      inputpath / targetpath -- directories holding one score file per identifier
      crossreference         -- whitespace separated file; the first column is
                                used as key, the last column as its partner
      minscore, idx          -- forwarded unchanged to `matchScores`
      output                 -- path of the tab separated result file

    The result file holds one line per interaction (target name, input name,
    min-Z, template info), sorted by descending min-Z.
    """
    inputs = _readNames(args.inputlist)
    print("Loaded %s input names from `%s`." % (len(inputs), args.inputlist))
    targets = _readNames(args.targetlist)
    print("Loaded %s target names from `%s`." % (len(targets), args.targetlist))

    crossReference = dict()
    with open(args.crossreference) as file:
        for line in file:
            columns = line.split()
            # A blank line has no columns; indexing it would raise IndexError.
            if not columns:
                continue
            crossReference.setdefault(columns[0], []).append(columns[-1])
    print("Loaded cross reference from `%s`." % args.crossreference)

    interactions = dict()
    targetDirectory = args.targetpath.rstrip("/")
    for targetName in targets:
        matchScores(targetFile="%s/%s" % (targetDirectory, targetName),
                    targetName=targetName,
                    inputs=inputs,
                    inputPath=args.inputpath,
                    crossReference=crossReference,
                    minScore=args.minscore,
                    idLength=args.idx,
                    interactions=interactions)

    # Run the comparison in the opposite direction only when every target is
    # also an input. This is tested on the unique names: counting matching
    # lines gives the wrong answer as soon as the target list repeats a name.
    if targets.issubset(inputs):
        inputDirectory = args.inputpath.rstrip("/")
        for inputName in inputs:
            matchScores(targetFile="%s/%s" % (inputDirectory, inputName),
                        targetName=inputName,
                        inputs=targets,
                        inputPath=args.targetpath,
                        crossReference=crossReference,
                        minScore=args.minscore,
                        idLength=args.idx,
                        interactions=interactions)

    ranked = sorted(interactions.values(), key=lambda item: item["minZ"], reverse=True)
    with open(args.output, 'w') as output_file:
        for entry in ranked:
            output_file.write("%s\t%s\t%s\t%s\n" % (entry["targetName"], entry["inputName"], entry["minZ"], entry["minInfo"]))
c790d25086dc
"planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents:
16
diff
changeset
|
59 |
c790d25086dc
"planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents:
16
diff
changeset
|
def matchScores(targetFile, targetName, inputs, inputPath, crossReference, minScore, idLength, interactions):
    """Score one target against every input and record pairs above `minScore`.

    The template hits of `targetFile` and of each input file (looked up as
    `<inputPath>/<inputName>`) are loaded with `getTemplateScores`. For every
    target template listed in `crossReference`, each partner template that is
    also an input hit yields the score min(target score, input score); the
    largest such score is the pair's min-Z.

    Pairs whose min-Z exceeds `minScore` are added to `interactions` under a
    key made of both names (larger name first) and the min-Z, unless that key
    already exists. The `interactions` dict is modified in place and returned.
    """
    targetTop, targetHits = getTemplateScores(targetFile, minScore, idLength)
    if not targetHits:
        print("No targets found `%s`" % targetFile)
        return interactions

    print ("Loaded target scores from `%s`." % targetFile)
    for inputName in inputs:
        inputFile = "%s/%s" % (inputPath.rstrip("/"), inputName)
        inputTop, inputHits = getTemplateScores(inputFile, minScore, idLength)

        # Track the best template pair seen so far for this input.
        bestScore = 0
        bestInfo = ""
        for templateId, templateScore in targetHits.items():
            for partnerId in crossReference.get(templateId, ()):
                if partnerId not in inputHits:
                    continue
                pairScore = min(templateScore, inputHits[partnerId])
                if pairScore > bestScore:
                    bestScore = pairScore
                    bestInfo = "%s\t%s\t%s\t%s" % (targetTop, inputTop, templateId, partnerId)

        if not bestScore > minScore:
            continue

        # Order the names so that both directions of a pair share one key.
        if targetName > inputName:
            first, second = targetName, inputName
        else:
            first, second = inputName, targetName
        interactionKey = "%s_%s_%s" % (first, second, bestScore)
        if interactionKey not in interactions:
            interactions[interactionKey] = dict(targetName=targetName, inputName=inputName, minZ=bestScore, minInfo=bestInfo)
        print("Predicting: %s, min-Z: %s, templates: %s" % (inputName, bestScore, bestInfo))
    return interactions
0
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
90 |
17
c790d25086dc
"planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents:
16
diff
changeset
|
def getTemplateScores(hhrFile, minScore, identifierLength):
    """Read the hit table of `hhrFile` and return its template scores.

    The first nine lines are skipped and reading stops at the first blank
    line after them. Each remaining line is read by fixed character
    positions: the template identifier starts at column 4 and spans
    `identifierLength` characters, the score is taken from columns 57-62.
    NOTE(review): these offsets presumably follow the HH-search `hhr` hit
    table layout -- confirm against the files actually produced.

    Returns a tuple `(topTemplate, result)`. `result` maps each template
    identifier to its highest score above `minScore`; `topTemplate` is the
    identifier of the first hit above `minScore`, or None when there is none.
    A path that is not an existing file yields `(None, {})`.
    """
    result = dict()
    topTemplate = None
    if not os.path.isfile(hhrFile):
        return topTemplate, result

    identifierEnd = identifierLength + 4
    with open(hhrFile) as file:
        for index, line in enumerate(file):
            if index <= 8:
                continue
            if not line.strip():
                break
            # Strip the padding of identifiers shorter than the configured
            # length; otherwise they can never equal a whitespace-split key.
            templateId = line[4:identifierEnd].strip()
            templateScore = float(line[57:63])
            if templateScore > minScore:
                if topTemplate is None:
                    topTemplate = templateId
                # A template may be listed more than once; keep its best
                # score instead of letting a later, weaker hit replace it.
                if templateId not in result or templateScore > result[templateId]:
                    result[templateId] = templateScore
    return topTemplate, result
0
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
108 |
d30785e31577
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff
changeset
|
if __name__ == "__main__":
    # Command-line entry point: collect the options and hand them to main().
    parser = argparse.ArgumentParser(description='This script identifies interactions by detecting matching HH-search results.')
    parser.add_argument('-il', '--inputlist', help='Text file containing identifiers.', required=True)
    parser.add_argument('-ip', '--inputpath', help='Directory containing `hhr` files', required=True)
    parser.add_argument('-tl', '--targetlist', help='Text file containing identifiers.', required=True)
    parser.add_argument('-tp', '--targetpath', help='Directory containing `hhr` files', required=True)
    parser.add_argument('-c', '--crossreference', help='Cross Reference index file', required=True)
    parser.add_argument('-x', '--idx', help='Length of identifier', type=int, default=6)
    parser.add_argument('-o', '--output', help='Output file containing min-Z scores', required=True)
    # The threshold is compared against scores parsed as floats, so accept
    # fractional values; whole numbers such as `-m 10` keep working.
    parser.add_argument('-m', '--minscore', help='min-Z score threshold', type=float, default=10)
    args = parser.parse_args()
    main(args)