diff spring_minz.py @ 23:5469e19f1f96 draft

"planemo upload commit 37a4c6844fd7ab1071ddf90f51915ec1a13c26b3"
author guerler
date Thu, 29 Oct 2020 13:04:47 +0000
parents acaff61a09b2
children 5d1ae615e4ec
line wrap: on
line diff
--- a/spring_minz.py	Wed Oct 28 06:49:58 2020 +0000
+++ b/spring_minz.py	Thu Oct 29 13:04:47 2020 +0000
@@ -3,120 +3,125 @@
 import os
 
 def main(args):
-	inputs = list()
-	with open(args.inputlist) as file:
-		for index, line in enumerate(file):
-			name = line.strip()
-			inputs.append(name)
-	print ("Loaded %s input names from `%s`." % (len(inputs), args.inputlist))
-	targets = list()
-	duplicates = 0
-	with open(args.targetlist) as file:
-		for index, line in enumerate(file):
-			name = line.strip()
-			targets.append(name)
-			if name in inputs:
-				duplicates = duplicates + 1
-	print ("Loaded %s target names from `%s`." % (len(targets), args.targetlist))
-	crossReference = dict()
-	with open(args.crossreference) as file:
-		for index, line in enumerate(file):
-			columns = line.split()
-			core = columns[0]
-			partner = columns[-1]
-			if core not in crossReference:
-				crossReference[core] = []
-			crossReference[core].append(partner)
-	print ("Loaded cross reference from `%s`." % args.crossreference)
-	interactions = dict()
-	for targetName in targets:
-		targetDirectory = args.targetpath.rstrip("/")
-		targetFile = "%s/%s" % (targetDirectory, targetName)
-		matchScores(targetFile=targetFile,
-					targetName=targetName,
-					inputs=sorted(inputs),
-					inputPath=args.inputpath,
-					crossReference=crossReference,
-					minScore=args.minscore,
-					idLength=args.idx,
-					interactions=interactions)
-	if duplicates != len(targets):
-		for inputName in inputs:
-			inputDirectory = args.inputpath.rstrip("/")
-			inputFile = "%s/%s" % (inputDirectory, inputName)
-			matchScores(targetFile=inputFile,
-						targetName=inputName,
-						inputs=targets,
-						inputPath=args.targetpath,
-						crossReference=crossReference,
-						minScore=args.minscore,
-						idLength=args.idx,
-						interactions=interactions)
-	interactions = sorted(interactions.values(), key=lambda item: item["minZ"], reverse=True)
-	with open(args.output, 'w') as output_file:
-		for entry in interactions:
-			output_file.write("%s\t%s\t%s\t%s\n" % (entry["targetName"], entry["inputName"], entry["minZ"], entry["minInfo"]))
+    """Identify interactions by matching HH-search results of targets and inputs.
+
+    Every target is scored against every input; with a separate input list a
+    second pass scores the inputs against the targets. Blank lines in the list
+    files are ignored. Pairs are written to args.output by descending min-Z;
+    matchScores appends each accepted pair to the log file args.log.
+    """
+    targetPath = args.targetpath.rstrip("/")
+    with open(args.targetlist) as file:
+        targets = [line.strip() for line in file if line.strip()]
+    print ("Loaded %s target names from `%s`." % (len(targets), args.targetlist))
+    if args.inputlist:
+        inputPath = args.inputpath.rstrip("/")
+        with open(args.inputlist) as file:
+            inputs = [line.strip() for line in file if line.strip()]
+        print ("Loaded %s input names from `%s`." % (len(inputs), args.inputlist))
+    else:
+        # Without an input list the targets are compared against themselves.
+        inputs = targets
+        inputPath = targetPath
+    crossReference = dict()
+    with open(args.crossreference) as file:
+        for line in file:
+            columns = line.split()
+            if not columns:
+                # A blank line has no columns; indexing it would raise IndexError.
+                continue
+            # First column is the core identifier, last column its partner.
+            crossReference.setdefault(columns[0], []).append(columns[-1])
+    print ("Loaded cross reference from `%s`." % args.crossreference)
+    interactions = dict()
+    # The context manager closes the log file even when scoring raises.
+    with open(args.log, 'a+') as logFile:
+        for targetName in targets:
+            matchScores(targetFile="%s/%s" % (targetPath, targetName),
+                        targetName=targetName,
+                        inputs=inputs,
+                        inputPath=inputPath,
+                        crossReference=crossReference,
+                        minScore=args.minscore,
+                        idLength=args.idx,
+                        logFile=logFile,
+                        interactions=interactions)
+        if args.inputlist:
+            # Second pass with the roles of inputs and targets swapped.
+            for inputName in inputs:
+                matchScores(targetFile="%s/%s" % (inputPath, inputName),
+                            targetName=inputName,
+                            inputs=targets,
+                            inputPath=targetPath,
+                            crossReference=crossReference,
+                            minScore=args.minscore,
+                            idLength=args.idx,
+                            logFile=logFile,
+                            interactions=interactions)
+    interactions = sorted(interactions.values(), key=lambda item: item["minZ"], reverse=True)
+    with open(args.output, 'w') as output_file:
+        for entry in interactions:
+            output_file.write("%s\t%s\t%s\t%s\n" % (entry["targetName"], entry["inputName"], entry["minZ"], entry["minInfo"]))
 
-def matchScores(targetFile, targetName, inputs, inputPath, crossReference, minScore, idLength, interactions):
-	targetTop, targetHits = getTemplateScores(targetFile, minScore, idLength)
-	if not targetHits:
-		print("No targets found `%s`" % targetFile)
-	else:
-		print ("Loaded target scores from `%s`." % targetFile)
-		for inputName in inputs:
-			inputDirectory = inputPath.rstrip("/")
-			inputFile = "%s/%s" % (inputDirectory, inputName)
-			inputTop, inputHits = getTemplateScores(inputFile, minScore, idLength)
-			minZ = 0
-			minInfo = ""
-			for t in targetHits:
-				if t in crossReference:
-					partners = crossReference[t]
-					for p in partners:
-						if p in inputHits:
-							score = min(targetHits[t], inputHits[p])
-							if score > minZ:
-								minZ = score
-								minInfo = "%s\t%s\t%s\t%s" % (targetTop, inputTop, t, p)
-			if minZ > minScore:
-				if targetName > inputName:
-					interactionKey = "%s_%s" % (targetName, inputName)
-				else:
-					interactionKey = "%s_%s" % (inputName, targetName)
-				if interactionKey in interactions:
-					if interactions[interactionKey]["minZ"] >= minZ:
-						continue
-				interactions[interactionKey] = dict(targetName=targetName, inputName=inputName, minZ=minZ, minInfo=minInfo)
-				print("Predicting: %s, min-Z: %s, templates: %s" % (inputName, minZ, minInfo))
-	return interactions
+def matchScores(targetFile, targetName, inputs, inputPath, crossReference, minScore, idLength, logFile, interactions):
+    """Score one target against all inputs and record pairs scoring above minScore.
+
+    A pair's min-Z is the best min(target score, input score) over all template
+    pairs linked through crossReference. `interactions` is updated in place and
+    keeps the highest min-Z per unordered pair; each update is written to logFile.
+    """
+    targetTop, targetHits = getTemplateScores(targetFile, minScore, idLength)
+    if not targetHits:
+        print("No targets found `%s`" % targetFile)
+        return
+    print ("Loaded target scores from `%s`." % targetFile)
+    for inputName in inputs:
+        inputTop, inputHits = getTemplateScores("%s/%s" % (inputPath, inputName), minScore, idLength)
+        minZ, minInfo = 0, ""
+        for t in targetHits:
+            for p in crossReference.get(t, ()):
+                if p in inputHits:
+                    score = min(targetHits[t], inputHits[p])
+                    if score > minZ:
+                        minZ = score
+                        minInfo = "%s\t%s\t%s\t%s" % (targetTop, inputTop, t, p)
+        if minZ <= minScore:
+            continue
+        # Tuple key: a "_"-joined string is ambiguous when names contain "_".
+        interactionKey = (max(targetName, inputName), min(targetName, inputName))
+        if interactionKey in interactions and interactions[interactionKey]["minZ"] >= minZ:
+            continue
+        interactions[interactionKey] = dict(targetName=targetName, inputName=inputName, minZ=minZ, minInfo=minInfo)
+        logFile.write("Interaction between %s and %s [min-Z: %s].\n" % (targetName, inputName, minZ))
 
 def getTemplateScores(hhrFile, minScore, identifierLength):
-	result = dict()
-	topTemplate = None
-	identifierLength = identifierLength + 4
-	if os.path.isfile(hhrFile):
-		with open(hhrFile) as file:
-			for index, line in enumerate(file):
-				if index > 8:
-					if not line.strip():
-						break
-					templateId = line[4:identifierLength]
-					templateScore = float(line[57:63])
-					if templateScore > minScore:
-						if topTemplate is None:
-							topTemplate = templateId
-						result[templateId] = templateScore
-	return topTemplate, result
+    """Return the top template id and a {template id: score} dict of hits above minScore."""
+    hits, top = dict(), None
+    if not os.path.isfile(hhrFile):
+        return top, hits
+    with open(hhrFile) as handle:
+        # The hit table starts on the tenth line and ends at the first blank line.
+        for rowIndex, row in enumerate(handle):
+            if rowIndex <= 8:
+                continue
+            if not row.strip():
+                break
+            hitId, hitScore = row[4:identifierLength + 4], float(row[57:63])
+            if hitScore > minScore:
+                top = hitId if top is None else top
+                hits[hitId] = hitScore
+    return top, hits
 
 if __name__ == "__main__":
-	parser = argparse.ArgumentParser(description='This script identifies interactions by detecting matching HH-search results.')
-	parser.add_argument('-il', '--inputlist', help='Text file containing identifiers.', required=True)
-	parser.add_argument('-ip', '--inputpath', help='Directory containing `hhr` files', required=True)
-	parser.add_argument('-tl', '--targetlist', help='Text file containing identifiers.', required=True)
-	parser.add_argument('-tp', '--targetpath', help='Directory containing `hhr` files', required=True)
-	parser.add_argument('-c', '--crossreference', help='Cross Reference index file', required=True)
-	parser.add_argument('-x', '--idx', help='Length of identifier', type=int, default=6)
-	parser.add_argument('-o', '--output', help='Output file containing min-Z scores', required=True)
-	parser.add_argument('-m', '--minscore', help='min-Z score threshold', type=int, default=10)
-	args = parser.parse_args()
-	main(args)
\ No newline at end of file
+    # Command line entry point: declare the options, parse them and run main().
+    parser = argparse.ArgumentParser(description='This script identifies interactions by detecting matching HH-search results.')
+    parser.add_argument('-tl', '--targetlist', help='Text file containing identifiers.', required=True)
+    parser.add_argument('-tp', '--targetpath', help='Directory containing `hhr` files', required=True)
+    parser.add_argument('-il', '--inputlist', help='Text file containing identifiers; if omitted, the targets are compared with each other.', required=False)
+    parser.add_argument('-ip', '--inputpath', help='Directory containing `hhr` files of the inputs; needed when --inputlist is given.', required=False)
+    parser.add_argument('-c', '--crossreference', help='Cross Reference index file', required=True)
+    parser.add_argument('-x', '--idx', help='Length of identifier', type=int, default=6)
+    parser.add_argument('-o', '--output', help='Output file containing min-Z scores', required=True)
+    parser.add_argument('-l', '--log', help='Log file', required=True)
+    parser.add_argument('-m', '--minscore', help='min-Z score threshold', type=float, default=10)  # float: scores are parsed as floats
+    main(parser.parse_args())
\ No newline at end of file