diff spring_minz.py @ 17:c790d25086dc draft

"planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
author guerler
date Wed, 28 Oct 2020 05:11:56 +0000
parents 16eb2acaaa20
children 5feab7f00f02
line wrap: on
line diff
--- a/spring_minz.py	Sat Oct 24 17:48:06 2020 +0000
+++ b/spring_minz.py	Wed Oct 28 05:11:56 2020 +0000
@@ -3,77 +3,118 @@
 import os
 
 def main(args):
-	names = []
-	with open(args.list) as file:
+	inputs = set()
+	with open(args.inputlist) as file:
 		for index, line in enumerate(file):
-			names.append(line.strip())
-	print ("Loaded %s names from `%s`." % (len(names), args.list))
-	crossreference = {}
+			name = line.strip()
+			inputs.add(name)
+	print ("Loaded %s input names from `%s`." % (len(inputs), args.inputlist))
+	targets = set()
+	duplicates = 0
+	with open(args.targetlist) as file:
+		for index, line in enumerate(file):
+			name = line.strip()
+			targets.add(name)
+			if name in inputs:
+				duplicates = duplicates + 1
+	print ("Loaded %s target names from `%s`." % (len(targets), args.targetlist))
+	crossReference = dict()
 	with open(args.crossreference) as file:
 		for index, line in enumerate(file):
 			columns = line.split()
 			core = columns[0]
 			partner = columns[-1]
-			if core not in crossreference:
-				crossreference[core] = []
-			crossreference[core].append(partner)
+			if core not in crossReference:
+				crossReference[core] = []
+			crossReference[core].append(partner)
 	print ("Loaded cross reference from `%s`." % args.crossreference)
-	toptarget, targets = get_template_scores(args.target, args.minscore, args.idx)
-	interactions = []
-	if not targets:
-		print("No targets found `%s`" % args.target)
+	interactions = dict()
+	for targetName in targets:
+		targetDirectory = args.targetpath.rstrip("/")
+		targetFile = "%s/%s" % (targetDirectory, targetName)
+		matchScores(targetFile=targetFile,
+					targetName=targetName,
+					inputs=inputs,
+					inputPath=args.inputpath,
+					crossReference=crossReference,
+					minScore=args.minscore,
+					idLength=args.idx,
+					interactions=interactions)
+	if duplicates == len(targets):
+		for inputName in inputs:
+			inputDirectory = args.inputpath.rstrip("/")
+			inputFile = "%s/%s" % (inputDirectory, inputName)
+			matchScores(targetFile=inputFile,
+						targetName=inputName,
+						inputs=targets,
+						inputPath=args.targetpath,
+						crossReference=crossReference,
+						minScore=args.minscore,
+						idLength=args.idx,
+						interactions=interactions)
+	interactions = sorted(interactions.values(), key=lambda item: item["minZ"], reverse=True)
+	with open(args.output, 'w') as output_file:
+		for entry in interactions:
+			output_file.write("%s\t%s\t%s\t%s\n" % (entry["targetName"], entry["inputName"], entry["minZ"], entry["minInfo"]))
+
+def matchScores(targetFile, targetName, inputs, inputPath, crossReference, minScore, idLength, interactions):
+	targetTop, targetHits = getTemplateScores(targetFile, minScore, idLength)
+	if not targetHits:
+		print("No targets found `%s`" % targetFile)
 	else:
-		print ("Loaded target scores from `%s`." % args.target)
-		for name in names:
-			input_directory = args.inputs.rstrip("/")
-			input_file = "%s/%s" % (input_directory, name)
-			toptemplate, templates = get_template_scores(input_file, args.minscore, args.idx)
-			minz = 0
-			mint = ""
-			for t in targets:
-				if t in crossreference:
-					partners = crossreference[t]
+		print ("Loaded target scores from `%s`." % targetFile)
+		for inputName in inputs:
+			inputDirectory = inputPath.rstrip("/")
+			inputFile = "%s/%s" % (inputDirectory, inputName)
+			inputTop, inputHits = getTemplateScores(inputFile, minScore, idLength)
+			minZ = 0
+			minInfo = ""
+			for t in targetHits:
+				if t in crossReference:
+					partners = crossReference[t]
 					for p in partners:
-						if p in templates:
-							score = min(targets[t], templates[p])
-							if score > minz:
-								minz = score
-								mint = "%s\t%s\t%s\t%s" % (toptarget, toptemplate, t, p)
-			if minz > args.minscore:
-				interactions.append((name, minz, mint))
-				print("Predicting: %s, min-Z: %s, templates: %s" % (name, minz, mint))
-		interactions.sort(key=lambda tup: tup[1], reverse=True)
-	with open(args.output, 'a+') as output_file:
-		for i in interactions:
-			output_file.write("%s\t%s\t%s\t%s\n" % (args.name, i[0], i[1], i[2]))
+						if p in inputHits:
+							score = min(targetHits[t], inputHits[p])
+							if score > minZ:
+								minZ = score
+								minInfo = "%s\t%s\t%s\t%s" % (targetTop, inputTop, t, p)
+			if minZ > minScore:
+				if targetName > inputName:
+					interactionKey = "%s_%s_%s" % (targetName, inputName, minZ)
+				else:
+					interactionKey = "%s_%s_%s" % (inputName, targetName, minZ)
+				if interactionKey not in interactions:
+					interactions[interactionKey] = dict(targetName=targetName, inputName=inputName, minZ=minZ, minInfo=minInfo)
+				print("Predicting: %s, min-Z: %s, templates: %s" % (inputName, minZ, minInfo))
+	return interactions
 
-def get_template_scores(hhr_file, min_score, identifier_length):
-	result = {}
-	toptemplate = None
-	identifier_length = identifier_length + 4
-	if os.path.isfile(hhr_file):
-		with open(hhr_file) as file:
+def getTemplateScores(hhrFile, minScore, identifierLength):
+	result = dict()
+	topTemplate = None
+	identifierLength = identifierLength + 4
+	if os.path.isfile(hhrFile):
+		with open(hhrFile) as file:
 			for index, line in enumerate(file):
 				if index > 8:
 					if not line.strip():
 						break
-					template_id = line[4:identifier_length]
-					template_score = float(line[57:63])
-					if template_score > min_score:
-						if toptemplate is None:
-							toptemplate = template_id
-						result[template_id] = template_score
-	return toptemplate, result
+					templateId = line[4:identifierLength]
+					templateScore = float(line[57:63])
+					if templateScore > minScore:
+						if topTemplate is None:
+							topTemplate = templateId
+						result[templateId] = templateScore
+	return topTemplate, result
 
 if __name__ == "__main__":
 	parser = argparse.ArgumentParser(description='This script identifies interactions by detecting matching HH-search results.')
-	parser.add_argument('-t', '--target', help='HHR target file result', required=True)
-	parser.add_argument('-n', '--name', help='HHR target name', required=True)
+	parser.add_argument('-il', '--inputlist', help='Text file containing identifiers.', required=True)
+	parser.add_argument('-ip', '--inputpath', help='Directory containing `hhr` files', required=True)
+	parser.add_argument('-tl', '--targetlist', help='Text file containing identifiers.', required=True)
+	parser.add_argument('-tp', '--targetpath', help='Directory containing `hhr` files', required=True)
 	parser.add_argument('-c', '--crossreference', help='Cross Reference index file', required=True)
 	parser.add_argument('-x', '--idx', help='Length of identifier', type=int, default=6)
-	parser.add_argument('-l', '--list', help='Text file containing identifiers.', required=True)
-	parser.add_argument('-i', '--inputs', help='Directory containing `hhr` files', required=True)
-	parser.add_argument('-o', '--output', help='Output file containing min-Z scores`', required=True)
+	parser.add_argument('-o', '--output', help='Output file containing min-Z scores', required=True)
 	parser.add_argument('-m', '--minscore', help='min-Z score threshold', type=int, default=10)
 	args = parser.parse_args()
 	main(args)
\ No newline at end of file