Mercurial > repos > guerler > springsuite
diff spring_minz.py @ 23:5469e19f1f96 draft
"planemo upload commit 37a4c6844fd7ab1071ddf90f51915ec1a13c26b3"
author | guerler |
---|---|
date | Thu, 29 Oct 2020 13:04:47 +0000 |
parents | acaff61a09b2 |
children | 5d1ae615e4ec |
line wrap: on
line diff
--- a/spring_minz.py Wed Oct 28 06:49:58 2020 +0000 +++ b/spring_minz.py Thu Oct 29 13:04:47 2020 +0000 @@ -3,120 +3,125 @@ import os def main(args): - inputs = list() - with open(args.inputlist) as file: - for index, line in enumerate(file): - name = line.strip() - inputs.append(name) - print ("Loaded %s input names from `%s`." % (len(inputs), args.inputlist)) - targets = list() - duplicates = 0 - with open(args.targetlist) as file: - for index, line in enumerate(file): - name = line.strip() - targets.append(name) - if name in inputs: - duplicates = duplicates + 1 - print ("Loaded %s target names from `%s`." % (len(targets), args.targetlist)) - crossReference = dict() - with open(args.crossreference) as file: - for index, line in enumerate(file): - columns = line.split() - core = columns[0] - partner = columns[-1] - if core not in crossReference: - crossReference[core] = [] - crossReference[core].append(partner) - print ("Loaded cross reference from `%s`." % args.crossreference) - interactions = dict() - for targetName in targets: - targetDirectory = args.targetpath.rstrip("/") - targetFile = "%s/%s" % (targetDirectory, targetName) - matchScores(targetFile=targetFile, - targetName=targetName, - inputs=sorted(inputs), - inputPath=args.inputpath, - crossReference=crossReference, - minScore=args.minscore, - idLength=args.idx, - interactions=interactions) - if duplicates != len(targets): - for inputName in inputs: - inputDirectory = args.inputpath.rstrip("/") - inputFile = "%s/%s" % (inputDirectory, inputName) - matchScores(targetFile=inputFile, - targetName=inputName, - inputs=targets, - inputPath=args.targetpath, - crossReference=crossReference, - minScore=args.minscore, - idLength=args.idx, - interactions=interactions) - interactions = sorted(interactions.values(), key=lambda item: item["minZ"], reverse=True) - with open(args.output, 'w') as output_file: - for entry in interactions: - output_file.write("%s\t%s\t%s\t%s\n" % (entry["targetName"], entry["inputName"], entry["minZ"], entry["minInfo"])) + logFile = open(args.log, 'a+') + targets = list() + targetPath = args.targetpath.rstrip("/") + with open(args.targetlist) as file: + for index, line in enumerate(file): + name = line.strip() + targets.append(name) + print ("Loaded %s target names from `%s`." % (len(targets), args.targetlist)) + if args.inputlist: + inputs = list() + inputPath = args.inputpath.rstrip("/") + with open(args.inputlist) as file: + for index, line in enumerate(file): + name = line.strip() + inputs.append(name) + print ("Loaded %s input names from `%s`." % (len(inputs), args.inputlist)) + else: + inputs = targets + inputPath = targetPath + crossReference = dict() + with open(args.crossreference) as file: + for index, line in enumerate(file): + columns = line.split() + core = columns[0] + partner = columns[-1] + if core not in crossReference: + crossReference[core] = [] + crossReference[core].append(partner) + print ("Loaded cross reference from `%s`." % args.crossreference) + interactions = dict() + for targetName in targets: + targetFile = "%s/%s" % (targetPath, targetName) + matchScores(targetFile=targetFile, + targetName=targetName, + inputs=inputs, + inputPath=inputPath, + crossReference=crossReference, + minScore=args.minscore, + idLength=args.idx, + logFile=logFile, + interactions=interactions) + if args.inputlist: + for inputName in inputs: + inputDirectory = inputPath + inputFile = "%s/%s" % (inputDirectory, inputName) + matchScores(targetFile=inputFile, + targetName=inputName, + inputs=targets, + inputPath=targetPath, + crossReference=crossReference, + minScore=args.minscore, + idLength=args.idx, + logFile=logFile, + interactions=interactions) + interactions = sorted(interactions.values(), key=lambda item: item["minZ"], reverse=True) + with open(args.output, 'w') as output_file: + for entry in interactions: + output_file.write("%s\t%s\t%s\t%s\n" % (entry["targetName"], entry["inputName"], entry["minZ"], entry["minInfo"])) + logFile.close() -def matchScores(targetFile, targetName, inputs, inputPath, crossReference, minScore, idLength, interactions): - targetTop, targetHits = getTemplateScores(targetFile, minScore, idLength) - if not targetHits: - print("No targets found `%s`" % targetFile) - else: - print ("Loaded target scores from `%s`." % targetFile) - for inputName in inputs: - inputDirectory = inputPath.rstrip("/") - inputFile = "%s/%s" % (inputDirectory, inputName) - inputTop, inputHits = getTemplateScores(inputFile, minScore, idLength) - minZ = 0 - minInfo = "" - for t in targetHits: - if t in crossReference: - partners = crossReference[t] - for p in partners: - if p in inputHits: - score = min(targetHits[t], inputHits[p]) - if score > minZ: - minZ = score - minInfo = "%s\t%s\t%s\t%s" % (targetTop, inputTop, t, p) - if minZ > minScore: - if targetName > inputName: - interactionKey = "%s_%s" % (targetName, inputName) - else: - interactionKey = "%s_%s" % (inputName, targetName) - if interactionKey in interactions: - if interactions[interactionKey]["minZ"] >= minZ: - continue - interactions[interactionKey] = dict(targetName=targetName, inputName=inputName, minZ=minZ, minInfo=minInfo) - print("Predicting: %s, min-Z: %s, templates: %s" % (inputName, minZ, minInfo)) - return interactions +def matchScores(targetFile, targetName, inputs, inputPath, crossReference, minScore, idLength, logFile, interactions): + targetTop, targetHits = getTemplateScores(targetFile, minScore, idLength) + if not targetHits: + print("No targets found `%s`" % targetFile) + else: + print ("Loaded target scores from `%s`." % targetFile) + for inputName in inputs: + inputFile = "%s/%s" % (inputPath, inputName) + inputTop, inputHits = getTemplateScores(inputFile, minScore, idLength) + minZ = 0 + minInfo = "" + for t in targetHits: + if t in crossReference: + partners = crossReference[t] + for p in partners: + if p in inputHits: + score = min(targetHits[t], inputHits[p]) + if score > minZ: + minZ = score + minInfo = "%s\t%s\t%s\t%s" % (targetTop, inputTop, t, p) + if minZ > minScore: + if targetName > inputName: + interactionKey = "%s_%s" % (targetName, inputName) + else: + interactionKey = "%s_%s" % (inputName, targetName) + if interactionKey in interactions: + if interactions[interactionKey]["minZ"] >= minZ: + continue + interactions[interactionKey] = dict(targetName=targetName, inputName=inputName, minZ=minZ, minInfo=minInfo) + logFile.write("Interaction between %s and %s [min-Z: %s].\n" % (targetName, inputName, minZ)) def getTemplateScores(hhrFile, minScore, identifierLength): - result = dict() - topTemplate = None - identifierLength = identifierLength + 4 - if os.path.isfile(hhrFile): - with open(hhrFile) as file: - for index, line in enumerate(file): - if index > 8: - if not line.strip(): - break - templateId = line[4:identifierLength] - templateScore = float(line[57:63]) - if templateScore > minScore: - if topTemplate is None: - topTemplate = templateId - result[templateId] = templateScore - return topTemplate, result + result = dict() + topTemplate = None + identifierLength = identifierLength + 4 + if os.path.isfile(hhrFile): + with open(hhrFile) as file: + for index, line in enumerate(file): + if index > 8: + if not line.strip(): + break + templateId = line[4:identifierLength] + templateScore = float(line[57:63]) + if templateScore > minScore: + if topTemplate is None: + topTemplate = templateId + result[templateId] = templateScore + return topTemplate, result if __name__ == "__main__": - parser = argparse.ArgumentParser(description='This script identifies interactions by detecting matching HH-search results.') - parser.add_argument('-il', '--inputlist', help='Text file containing identifiers.', required=True) - parser.add_argument('-ip', '--inputpath', help='Directory containing `hhr` files', required=True) - parser.add_argument('-tl', '--targetlist', help='Text file containing identifiers.', required=True) - parser.add_argument('-tp', '--targetpath', help='Directory containing `hhr` files', required=True) - parser.add_argument('-c', '--crossreference', help='Cross Reference index file', required=True) - parser.add_argument('-x', '--idx', help='Length of identifier', type=int, default=6) - parser.add_argument('-o', '--output', help='Output file containing min-Z scores', required=True) - parser.add_argument('-m', '--minscore', help='min-Z score threshold', type=int, default=10) - args = parser.parse_args() - main(args) \ No newline at end of file + parser = argparse.ArgumentParser(description='This script identifies interactions by detecting matching HH-search results.') + parser.add_argument('-tl', '--targetlist', help='Text file containing identifiers.', required=True) + parser.add_argument('-tp', '--targetpath', help='Directory containing `hhr` files', required=True) + parser.add_argument('-il', '--inputlist', help='Text file containing identifiers.', required=False) + parser.add_argument('-ip', '--inputpath', help='Directory containing `hhr` files', required=False) + parser.add_argument('-c', '--crossreference', help='Cross Reference index file', required=True) + parser.add_argument('-x', '--idx', help='Length of identifier', type=int, default=6) + parser.add_argument('-o', '--output', help='Output file containing min-Z scores', required=True) + parser.add_argument('-l', '--log', help='Log file', required=True) + parser.add_argument('-m', '--minscore', help='min-Z score threshold', type=int, default=10) + args = parser.parse_args() + main(args) \ No newline at end of file