antarna: antaRNA.py comparison

comparison antaRNA.py @ 0:fcf4719d3831 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/antarna/ commit 71414cf7f040d610afc3f02be31446efc3a82a40-dirty

author	rnateam
date	Wed, 13 May 2015 11:02:53 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:fcf4719d3831
+import numpy
+import sys
+import random
+import subprocess
+import re
+import decimal
+import math
+import os
+import shutil
+import time
+import types
+import argparse
+#from argparse import RawTextHelpFormatter
+#############################
+# FUNCTIONS
+def print2file(f, i, m):
+	"""
+		print content i to file f in mode m
+	"""
+	line = str(i)
+	if m == "a":
+		call = "echo \"" +  line + "\" >> " + f
+	elif m == "w":
+		call = "echo \"" +  line + "\" > " + f
+	os.system(call)
+# checking and correcting the alphabet of the constraint sequence
+def checkSequenceConstraint(SC):
+	"""
+		Checks the Sequence constraint for illegal nucleotide characters
+	"""
+	out = ""
+	for c in SC:
+		c = c.upper()
+		if c not in "ACGURYSWKMBDHVN":
+# and c!= "R" and c != "Y" and c != "S" and c != "W" and c != "K" and c != "M" and c != "B" and c != "D" and c != "H" and c != "V":
+			if c == "T":
+				c = "U"
+			else:
+				print "\tIllegal Character in the constraint sequence!"
+				print "\tPlease use the IUPAC nomenclature for defining nucleotides in the constraint sequence!"
+				print "\tA   	Adenine"
+				print "\tC   	Cytosine"
+				print "\tG   	Guanine"
+				print "\tT/U 	Thymine/Uracil"
+				print "\tR 	A or G"
+				print "\tY 	C or T/U"
+				print "\tS 	G or C"
+				print "\tW 	A or T/U"
+				print "\tK 	G or T/U"
+				print "\tM 	A or C"
+				print "\tB 	C or G or T/U"
+				print "\tD 	A or G or T/U"
+				print "\tH 	A or C or T/U"
+				print "\tV 	A or C or G"
+				print "\tN	any base"
+				exit(0)
+		out += c
+	return (1, out)
+def transform(seq):
+	"""
+		Transforms "U" to "T" for the processing is done on DNA alphabet
+	"""
+	S = ""
+	for s in seq:
+		if s == "T":
+			S += "U"
+		else:
+			S += s
+	return S
+def checkSimilarLength(s, SC):
+	"""
+		Compares sequence and structure constraint length
+	"""
+	if len(s) == len(SC):
+		return 1
+	else:
+		return 0
+def isStructure(s):
+	"""
+		Checks if the structure constraint only contains "(", ")", and "." and legal fuzzy structure constraint characters.
+	"""
+	returnvalue = 1
+	for a in range(0,len(s)):
+		if s[a] not in  ".()[]{}<>":
+			if s[a] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+				returnvalue = 0
+	return returnvalue
+def isBalanced(s):
+	"""
+		Check if the structure s is of a balanced nature
+	"""
+	balance = 1
+	for bracket in ["()", "[]", "{}", "<>"]:
+		counter = 0
+		for a in xrange(len(s)):
+			if s[a] in bracket[0]:
+				counter += 1
+			elif s[a] in bracket[1]:
+				counter -= 1
+		if counter != 0:
+			balance = 0
+	return balance
+def fulfillsHairpinRule(s):
+	"""
+		CHECKING FOR THE 3 nt LOOP INTERSPACE
+			for all kind of basepairs, even wihtin the pdeudoknots
+	"""
+	fulfillsRules = 1
+	for bracket in ["()", "[]", "{}", "<>"]:
+		last_opening_char = 0
+		check = 0
+		for a in xrange(len(s)):
+			if s[a] == bracket[0]:
+				last_opening_char = a
+				check = 1
+			elif s[a] == bracket[1] and check == 1:
+				check = 0
+				if a - last_opening_char < 4:
+					return 0
+	return 1
+def isValidStructure(s):
+	"""
+		Checks, if the structure s is a valid structure
+	"""
+	Structure = isStructure(s)
+	Balanced = isBalanced(s)
+	HairpinRule = fulfillsHairpinRule(s)
+	if Structure == 1 and Balanced == 1 and HairpinRule == 1:
+		return 1
+	else:
+		print Structure, Balanced, HairpinRule
+		return 0
+def loadIUPACcompatibilities(IUPAC, useGU):
+	"""
+		Generating a hash containing all compatibilities of all IUPAC RNA NUCLEOTIDES
+	"""
+	compatible = {}
+	for nuc1 in IUPAC: # ITERATING OVER THE DIFFERENT GROUPS OF IUPAC CODE
+		sn1 = list(IUPAC[nuc1])
+		for nuc2 in IUPAC: # ITERATING OVER THE DIFFERENT GROUPS OF IUPAC CODE
+			sn2 = list(IUPAC[nuc2])
+			compatib = 0
+			for c1 in sn1: # ITERATING OVER THE SINGLE NUCLEOTIDES WITHIN THE RESPECTIVE IUPAC CODE:
+				for c2 in sn2: # ITERATING OVER THE SINGLE NUCLEOTIDES WITHIN THE RESPECTIVE IUPAC CODE:
+					# CHECKING THEIR COMPATIBILITY
+					if useGU == True:
+						if (c1 == "A" and c2 == "U") or (c1 == "U" and c2 == "A") or (c1 == "C" and c2 == "G") or (c1 == "G" and c2 == "C") or (c1 == "G" and c2 == "U") or (c1 == "U" and c2 == "G"):
+							compatib = 1
+					else:
+						if (c1 == "A" and c2 == "U") or (c1 == "U" and c2 == "A") or (c1 == "C" and c2 == "G") or (c1 == "G" and c2 == "C"):
+							compatib = 1
+			compatible[nuc1 + "_" + nuc2] = compatib # SAVING THE RESPECTIVE GROUP COMPATIBILITY, REVERSE SAVING IS NOT REQUIRED, SINCE ITERATING OVER ALL AGAINST ALL
+	return compatible
+def isCompatibleToSet(c1, c2, IUPAC_compatibles):
+	"""
+		Checks compatibility of c1 wihtin c2
+	"""
+	compatible = True
+	for setmember in c2:
+		#print setmember
+		if isCompatible(c1, setmember, IUPAC_compatibles) == False:
+			return False
+	return compatible
+def isCompatible(c1, c2, IUPAC_compatibles):
+	"""
+		Checks compatibility between character c1 and c2
+	"""
+	if IUPAC_compatibles[c1 + "_" + c2] == 1:
+		return True
+	else:
+		return False
+def isStructureCompatible(lp1, lp2 ,bp):
+	"""
+		Checks, if the region within lp1 and lp2 is structurally balanced
+	"""
+	x = lp1 + 1
+	while (x < lp2):
+		if (bp[x] <= lp1 or bp[x] > lp2):
+			return False
+		if x == bp[x]:
+			x += 1
+		else:
+			x = bp[x] + 1
+	return x == lp2
+def checkConstaintCompatibility(basepairstack, sequenceconstraint, IUPAC_compatibles):
+	"""
+		Checks if the constraints are compatible to each other
+	"""
+	returnstring = ""
+	compatible = 1
+	for id1 in basepairstack:  # key = (constraint , (pos, constraint)))
+		constr1 = basepairstack[id1][0]
+		id2 = basepairstack[id1][1][0]
+		constr2 = basepairstack[id1][1][1]
+		if id1 != id2 and not isCompatible(constr1, constr2, IUPAC_compatibles):
+			compatible = 0
+			returnstring += "nucleotide constraint " + str(constr1) + " at position " + str(id1) + " is not compatible with nucleotide constraint " + str(constr2) + " at position " + str(id2) + "\n"
+#if not isCompatible(basepairstack[basepair][0], basepairstack[basepair][1][1]):
+#compatible = 0
+#else:
+#returnstring += "nucleotide constraint " + str(basepairstack[basepair][0]) + " at position " + str(basepair) + " is compatible with nucleotide constraint " + str(basepairstack[basepair][1][1]) + " at position " + str(basepairstack[basepair][1][0]) + "\n"
+	return (compatible, returnstring)
+def getLP(BPSTACK):
+"""
+Retreives valid lonley base pairs from a base pair stack
+"""
+#20 ('N', (>BLOCK<, 'N'))
+# geting single base pairs
+stack = {}
+LP = {}
+if type(BPSTACK[random.choice(BPSTACK.keys())]) == types.TupleType:
+for i in BPSTACK.keys():
+#if str(BPSTACK[i][1][0]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+stack[i] = int(BPSTACK[i][1][0])
+	#print i , BPSTACK[i][1][0]
+else:
+for i in BPSTACK.keys():
+#if str(BPSTACK[i]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+stack[i] = BPSTACK[i]
+# removing redundant base pair indices
+for i in stack.keys():
+if i >= stack[i]:
+	del stack[i]
+# actual checking for single lonley base pairs
+for i in stack.keys():
+if not (i-1 in stack and stack[i-1] == stack[i] + 1) and not (i+1 in stack and stack[i+1] == stack[i] - 1):
+LP[i] = stack[i]
+##actual removal of 2er lonley base pairs
+for i in stack.keys():
+if not (i-1 in stack and stack[i-1] == stack[i] + 1) and  (i+1 in stack and stack[i+1] == stack[i] - 1) and not (i+2 in stack and stack[i+2] == stack[i] - 2):
+LP[i] = stack[i]
+LP[i+1] = stack[i+1]
+#if type(BPSTACK[random.choice(BPSTACK.keys())]) == types.TupleType:
+#for i in BPSTACK.keys():
+##if str(BPSTACK[i][1][0]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+#stack[i] = int(BPSTACK[i][1][0])
+	##print i , BPSTACK[i][1][0]
+#else:
+#for i in BPSTACK.keys():
+##if str(BPSTACK[i]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+#stack[i] = BPSTACK[i]
+#for i in stack.keys():
+#if i >= stack[i]:
+	#del stack[i]
+return LP
+def getBPStack(s, seq):
+	"""
+		Returns a dictionary of the corresponding basepairs of the structure s and the sequence constraint seq.
+	"""
+	tmp_stack = {"()":[], "{}":[], "[]":[], "<>":[]}
+	bpstack = {}
+	for i in xrange(len(s)):
+# REGULAR SECONDARY STRUCTURE DETECTION
+		if s[i] in "(){}[]<>":
+			no = 0
+			### opening
+			if s[i] in "([{<":
+				if s[i] == "(":
+					tmp_stack["()"].append((i, seq[i]))
+				elif s[i] == "[":
+					tmp_stack["[]"].append((i, seq[i]))
+				elif s[i] == "{":
+					tmp_stack["{}"].append((i, seq[i]))
+				elif s[i] == "<":
+					tmp_stack["<>"].append((i, seq[i]))
+			#closing
+			elif s[i] in ")]}>":
+				if s[i]  == ")":
+					no, constr = tmp_stack["()"].pop()
+				elif s[i]  == "]":
+					no, constr = tmp_stack["[]"].pop()
+				elif s[i]  == "}":
+					no, constr = tmp_stack["{}"].pop()
+				elif s[i]  == ">":
+					no, constr = tmp_stack["<>"].pop()
+				bpstack[no] = (constr, (i, seq[i]))
+				bpstack[i] = (seq[i] ,(no, constr))
+		elif s[i] == ".":
+			bpstack[i] = (seq[i], (i, seq[i]))
+		elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+			bpstack[i] = (seq[i], (i, seq[i]))
+	return (bpstack, getLP(bpstack))
+def getbpStack(s):
+	"""
+		Returns a dictionary of the corresponding basepairs of the structure s and the sequence constraint seq.
+	"""
+	tmp_stack = {"()":[], "{}":[], "[]":[], "<>":[]}
+	bpstack = {}
+	for i in xrange(len(s)):
+		if s[i] in "(){}[]<>":
+			no = 0
+			### opening
+			if s[i] in "([{<":
+				if s[i] == "(":
+					tmp_stack["()"].append(i)
+				elif s[i] == "[":
+					tmp_stack["[]"].append(i)
+				elif s[i] == "{":
+					tmp_stack["{}"].append(i)
+				elif s[i] == "<":
+					tmp_stack["<>"].append(i)
+			#closing
+			elif s[i] in ")]}>":
+				if s[i] == ")":
+					no = tmp_stack["()"].pop()
+				elif s[i] == "]":
+					no = tmp_stack["[]"].pop()
+				elif s[i] == "}":
+					no = tmp_stack["{}"].pop()
+				elif s[i] == ">":
+					no = tmp_stack["<>"].pop()
+				bpstack[no] = i # save basepair in the format {opening base id (opening seq constr,(closing base id, closing seq constr))}
+				bpstack[i] = no # save basepair in the format {closing base id (closing seq constr,(opening base id, opening seq constr))}
+		elif s[i] == ".": # no structural constaint given: produce entry, which references itself as a base pair partner....
+			bpstack[i] = i
+		elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+			bpstack[i] = i
+#elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+## per position, assigned to a certain block, the target nucleotide, with whcih it should interact is marked with the specific
+## block character
+#bpstack[i] = s[i]
+	return (bpstack, getLP(bpstack))
+def maprange( a, b, s):
+"""
+Mapping function
+"""
+(a1, a2), (b1, b2) = a, b
+return  b1 + ((s - a1) * (b2 - b1) / (a2 - a1))
+def applyGCcontributionPathAdjustment(pathlength, tmpGC, nt):
+	"""
+		GC path length contribution calculation.
+	"""
+	GCadjustment = 1.5
+	minimum = 0.5
+	upper = GCadjustment
+	lower = minimum
+	if nt == "A" or nt == "U":
+		pathlength = pathlength * maprange( (0, 1) , (lower, upper), tmpGC)
+	if nt == "G" or nt == "C":
+		#pathlength = pathlength * (float(1-tmpGC))
+		pathlength = pathlength * maprange( (1, 0) , (lower, upper), tmpGC)
+	return pathlength
+def getConstraint(TE, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements):
+	"""
+	Dependend on the situation in the constraint an the respective path section, setting wether a specific constraint can be given or not (for that path section)
+	"""
+	# TE :: transition element / path section under dispute
+	# id1 :: id of the position of the caharacter to which the transition is leading to
+	# id2 :: id of the position of the character, which is listed in the BPinformation, it can be id1 as well, when no bp is present
+	# val :: BPstack information of the specific position
+	# constr1 :: constraining character of pos id1
+	# constr2 :: constraining character of pos id2
+	id1 = int(TE.split(".")[0])
+	val = BPstack[id1] # check out the value of the destination character in the basepair/constraint stack
+	constr1 = val[0] # getting the constraint character of position id1
+	id2 = int(val[1][0]) # getting position id2
+	constr2 = val[1][1] # getting the sequence constraint for position id2
+	targetNucleotide = TE.split(".")[1][-1:] # where the edge is leading to
+	c1 = set(IUPAC[constr1]) # getting all explicit symbols of c1
+	c2 = set(IUPAC_reverseComplements[constr2]) # getting the reverse complement explicit symbols of c2
+	if targetNucleotide in c1:
+		if id1 == id2:
+			return 1
+		else:
+			if targetNucleotide in c2:
+				return 1
+			else:
+				return 0
+	else:
+		return 0
+"""
+def getConstraint(TE, BPstack):
+# TE :: transition element / path section
+# id1 :: id of the position of the caharacter to which the transition is leading to
+# id2 :: id of the position of the character, which is listed in the BPinformation, it can be id1 as well, when no bp is present
+# val :: BPstack information of the specific position
+# constr1 :: constraining character of pos id1
+# constr2 :: constraining character of pos id2
+### BPstack [id1] = (constr1, (id2, constr2))
+id1 = TE.split(".")[0]
+#print id1
+#id1 = TE.find(TE.strip("_")) #  strip the path section and getting the position of the section
+#if len(TE.strip("_")) == 2: # check if the path section is from an internal and not an initial transition
+#id1 += 1 # increase position id1 by 1, since the last character of the section is the destination character
+val = BPstack[int(id1)] # check out the value of the destination character in the basepair/constraint stack
+constr1 = val[0] # getting the constraint character of position id1
+id2 = val[1][0] # getting position id2
+constr2 = val[1][1] # getting the sequence constraint for position id2
+#print TE, id1, constr1, id2, constr2,
+#TE.split(".")[1][-1:]
+if id1 == id2: # both ids were the same with either character, sequential or no sequential constraint -> no basepair constraint
+if constr1 == TE.split(".")[1][-1:] and constr2 == TE.split(".")[1][-1:]: # case if the single base constraints on position id1 == id2 are the same as the destination character on id1
+#print 1
+return 1
+elif constr1 == constr2 == "N": # case if the single base constraints on position id1 == id2 has no constraint
+#print 1
+return 1
+else: # single base sequence constraints differ
+#print 0
+return 0
+elif id1 != id2: # showing differentq ids, indicating a bp, (basepair structural constraint)
+if constr1 == "N" and constr2 == "N": # no sequence constraint
+#print 1
+return 1
+if constr1 == "N" and constr2 != "N": # c1 has no constraint, c2 has character constraint (sequence constraint of closing bases)
+if TE.split(".")[1][-1:] == complementBase(constr2): # the current path section destination base is equal to the complement base of the mentioned sequence constraint in constr2
+	#print 1
+	return 1
+else: # case if the current path section destination base is not equeal to the mentioned complement sequence constraint in constr2
+	#print 0
+	return 0
+if constr1 != "N" and constr2 == "N": # c1 has character constraint, c2 has no character constraint (sequence constraint in the opening base)
+if TE.split(".")[1][-1:] == constr1:  # the current path section destination base is as constrained with constr1
+	#print 1
+	return 1
+else: # the current path section destination base is not as constrained in constr1
+	#print 0
+	return 0
+if constr1 != "N" and constr2 != "N": # both positions have sequential constraint
+if TE.split(".")[1][-1:] == constr1:
+	#print 1
+	return 1
+else:
+	#print 0
+	return 0
+"""
+def applyTerrainModification(terrain, s, tmpGC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements):
+	#nucleotides = {'A': 0, 'C': 1,'G': 2,'T': 3}
+	dels = []
+	for terrainelement in sorted(terrain):
+		pheromone, pathlength = terrain[terrainelement]
+		pheromone = getConstraint(terrainelement, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
+		pathlength = getConstraint(terrainelement, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
+		pathlength = applyGCcontributionPathAdjustment(pathlength, tmpGC,terrainelement.split(".")[1][-1:])
+		if pheromone * pathlength == 0: dels.append(terrainelement)
+		terrain[terrainelement] = (pheromone, pathlength,[])
+	further_dels = {}
+	for terrainelement in sorted(dels):
+		pos, nucs = terrainelement.split(".")
+		if int(pos) < len(s)-1:
+			to_nt = nucs[-1:]
+			successor_pos = int(pos) + 1
+			for i in ["A", "C", "G", "U"]:
+				del_element = str(successor_pos) + "." + to_nt + i
+				further_dels[del_element] = 1
+		further_dels[terrainelement] = 1
+	# deleting the inbound and outbound edges, which are forbidden
+	for terrainelement in further_dels:
+		del terrain[terrainelement]
+	# allocate the appropriate children of edges
+	for terrainelement in terrain:
+		pheromone, pathlength, children = terrain[terrainelement]
+		pos, nucs = terrainelement.split(".")
+		if int(pos) < len(s):
+			to_nt = nucs[-1:]
+			successor_pos = int(pos) + 1
+			for i in ["A", "C", "G", "U"]:
+				if str(successor_pos) + "." + to_nt + i in terrain:
+					children.append(str(successor_pos) + "." + to_nt + i)
+		terrain[terrainelement] = (pheromone, pathlength,children)
+	starts = []
+	for i in ["A", "C", "G", "U"]:
+		if str(0) + "." + i in terrain:
+			starts.append(str(0) + "." + i)
+	terrain["00.XY"] = (1, 1, starts)
+	return (terrain, BPstack)
+def initTerrain(s): # THE CLASSIC
+	"""
+		Initialization of the terrain with graph like terrain... vertices are modeled implicitly
+	"""
+	nt = ["A","C","G","U"]
+	nt2 = ["AA","AC","AG","AU","CA","CC","CG","CU","GA","GC","GG","GU","UA","UC","UG","UU"] # Allowed dinucleotides
+	e = {}
+	pathlength = 1
+	pheromone = 1
+	for p in xrange(len(s)):
+		if p == 0:
+			for i in nt:
+				e["%s.%s"%(p,i)] = (pheromone, pathlength)
+		elif p > 0:
+			for n in nt2:
+				e["%s.%s"%(p,n)] = (pheromone, pathlength)
+	return e
+def complementBase(c):
+	"""
+		Returns the complement RNA character of c (without GU base pairs)
+	"""
+	retChar = ""
+	if c == "A" :
+		retChar = "U"
+	elif c == "U":
+		retChar = "A"
+	elif c == "C":
+		retChar = "G"
+	elif c == "G":
+		retChar = "C"
+	return retChar
+def printTerrain(terrain):
+	#print sorted(terrain.keys())
+	tmp_i = "0"
+	tmp_c = 0
+	terrain = terrain[0]
+	for a, i in enumerate(sorted(terrain.keys())):
+		#print a
+		if i.split(".")[0] != tmp_i:
+			print "\nElements:", tmp_c,"\n#########################\n", i, terrain[i]
+			tmp_c = 1
+			tmp_i = i.split(".")[0]
+		else:
+			print i, terrain[i]
+			tmp_c += 1
+	print "\nElements:", tmp_c
+	print "#########################"
+	print len(terrain)
+def pickStep(tmp_steps, summe):
+	"""
+		Selects a step within the terrain
+	"""
+	if len(tmp_steps) == 1:
+		return tmp_steps[0][1] # returning the nucleotide of the only present step
+	else:
+		rand = random.random() # draw random number
+		mainval = 0
+		for choice in xrange(len(tmp_steps)):
+			val, label = tmp_steps[choice]
+			mainval += val/float(summe)
+			if mainval > rand: # as soon, as the mainval gets larger than the random value the assignment is done
+				return label
+def getPath(s, tmp_terrain, tmp_BPstack, alpha, beta, IUPAC, IUPAC_reverseComplements):
+	"""
+		Performs a walk through the terrain and assembles a sequence, while respecting the structure constraint and IUPAC base complementarity
+		of the base pairs GU, GC and AT
+	"""
+	nt = ["A","C","G","U"]
+	prev_edge = "00.XY"
+	sequence = ""
+	while len(sequence) <  len(s):
+		coming_from = sequence[-1:]
+		summe = 0
+		steps = []
+		i = len(sequence)
+		allowed_nt = "ACGU"
+		# base pair closing case check, with subsequent delivery of a reduced allowed nt set
+		if i > tmp_BPstack[i][1][0]:
+			jump =  tmp_BPstack[i][1][0]
+			nuc_at_jump = sequence[jump]
+			allowed_nt = IUPAC_reverseComplements[nuc_at_jump]
+			#allowed_nt = complementBase(nuc_at_jump)
+		# Checking for every possible nt if it is suitable for the selection procedure
+		for edge in tmp_terrain[prev_edge][-1]:
+			if edge[-1:] in allowed_nt:
+				pheromone, PL , children = tmp_terrain[edge]
+				#if PL > 0:
+				value = ((float(pheromone * alpha)) + ((1/float(PL)) * beta))
+				summe += value
+				steps.append((value, edge))
+		prev_edge = pickStep(steps, summe)
+		sequence += prev_edge[-1:]
+	return sequence
+###
+# STRUCTURE PREDICTORS
+###
+def getPKStructure(sequence, temperature, mode = "A"):
+	"""
+		Initialization pKiss mfe pseudoknot prediction
+	"""
+	p2p = "pKiss"
+	#p2p = "/usr/local/pkiss/2014-03-17/bin/pKiss_mfe"
+	strategy = "--strategy "
+	t = "--temperature " + str(temperature)
+	if mode == "A": strategy += "A"
+	elif mode == "B": strategy += "B"
+	elif mode == "C": strategy += "C"
+	elif mode == "D": strategy += "D"
+	elif mode == "P": strategy += "P"
+	p = subprocess.Popen( ([p2p, "--mode mfe", strategy, t]),
+				#shell = True,
+				stdin = subprocess.PIPE,
+				stdout = subprocess.PIPE,
+				stderr = subprocess.PIPE,
+				close_fds = True)
+	#print p.stderr.readline()
+	p.stdin.write(sequence+'\n')
+	pks = p.communicate()
+	structure = "".join(pks[0].split("\n")[2].split(" ")[-1:])
+	return structure
+def init_RNAfold(version, temperature, paramFile = ""):
+	"""
+		Initialization RNAfold listener
+	"""
+	p2p = ""
+	t = "-T " + str(temperature)
+	P = ""
+	if paramFile != "":
+		P = "-P " + paramFile
+	if version == 185:
+		p2p = "/home/rk/Software/ViennaRNA/ViennaRNA-1.8.5/Progs/RNAfold"
+		p = subprocess.Popen( ([p2p, '--noPS', '-d 2', t, P]),
+					shell = True,
+					stdin = subprocess.PIPE,
+					stdout = subprocess.PIPE,
+					stderr = subprocess.PIPE,
+					close_fds = True)
+		return p
+	elif version == 213:
+		p2p = "RNAfold"
+		p = subprocess.Popen( ([p2p, '--noPS', '-d 2', t, P]),
+					#shell = True,
+					stdin = subprocess.PIPE,
+					stdout = subprocess.PIPE,
+					stderr = subprocess.PIPE,
+					close_fds = True)
+		return p
+	else:
+		exit(0)
+def consult_RNAfold(seq, p):
+	"""
+		Consults RNAfold listener
+	"""
+	p.stdin.write(seq+'\n')
+	out = ""
+	for i in xrange(2):
+		out += p.stdout.readline()
+	return out
+def getRNAfoldStructure(struct2, process1):
+	"""
+		Retrieves folded structure of a RNAfold call
+	"""
+	RNAfold_pattern = re.compile('.+\n([.()]+)\s.+')
+	#RNAdist_pattern = re.compile('.*\s([\d]+)')
+	RNAfold_match = RNAfold_pattern.match(consult_RNAfold(struct2, process1))
+	current_structure = ""
+	#if RNAfold_match:
+	return RNAfold_match.group(1)
+def init_RNAdistance():
+	"""
+		Initialization of RNAdistance listener
+	"""
+	#p2p = "/home/rk/Software/ViennaRNA/ViennaRNA-1.8.5/Progs/RNAdistance"
+	p2p = "RNAdistance"
+	p = subprocess.Popen( ([p2p]),
+							#shell = True,
+							stdin = subprocess.PIPE,
+							stdout = subprocess.PIPE,
+							stderr = subprocess.PIPE,
+							close_fds = True)
+	return p
+def consult_RNAdistance(s1, s2, p):
+	"""
+		Consulting the RNAdistance listener
+	"""
+	p.stdin.write(s1+'\n')
+	p.stdin.write(s2+'\n')
+	out = ""
+	out_tmp = p.stdout.readline().strip()
+	if out_tmp != "":
+		out += out_tmp
+	return out
+def getInducingSequencePositions(Cseq, degreeOfSequenceInducement):
+	"""
+		Delimiting the degree of structure inducement by the supplied sequence constraint.
+		0 : no sequence induced structure constraint
+		1 : "ACGT" induce structure (explicit nucleotide structure inducement level)
+		2 : "MWKSYR" and "ACGT" (explicit and double instances)
+		3 : "BDHV" , "MWKSYR" and "ACGT" (explicit, double, and triple instances)
+	"""
+	setOfNucleotides = "" # resembling the "0"-case
+	if degreeOfSequenceInducement == 1:
+		setOfNucleotides = "ACGU"
+	elif degreeOfSequenceInducement == 2:
+		setOfNucleotides = "ACGUMWKSYR"
+	elif degreeOfSequenceInducement == 3:
+		setOfNucleotides = "ACGUMWKSYRBDHV"
+	#elif degreeOfSequenceInducement == 4:
+		#setOfNucleotides = "ACGTMWKSYRBDHVN"
+	tmpSeq = ""
+	listset = setOfNucleotides
+	for pos in Cseq:
+		if pos not in listset:
+			tmpSeq += "N"
+		else:
+			tmpSeq += pos
+	return setOfNucleotides, tmpSeq
+def getBPDifferenceDistance(stack1, stack2):
+	"""
+		Based on the not identical amount of base pairs within both structure stacks
+	"""
+	d = 0
+	for i in stack1.keys():
+		# check base pairs in stack 1
+		if i < stack1[i] and stack1[i] != stack2[i]:
+			d += 1
+		# check base pairs in stack 2
+	for i in stack2.keys():
+		if i < stack2[i] and stack1[i] != stack2[i]:
+			d += 1
+	return d
+def getStructuralDistance(target_structure, Cseq,  path, RNAfold, verbose, LP, BP, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy):
+	"""
+		Calculator for Structural Distance
+	"""
+	# fold the current solution's sequence to obtain the structure
+	current_structure = ""
+	if pseudoknots:
+		current_structure = getPKStructure(path,strategy)
+	else:
+		RNAfold_match = RNAfold_pattern.match(consult_RNAfold(path, RNAfold))
+		current_structure = RNAfold_match.group(1)
+	# generate the current structure's base-pair stack
+	bp = getbpStack(current_structure)[0]
+	# add case-dependend structural constraints in case of lonley basepairs formation
+	tmp_target_structure_bp = getbpStack(target_structure)[0]
+	for lp in LP:
+		if bp[lp] == LP[lp]: # if the base pair is within the current solution structure, re-add the basepair into the constraint structure.
+			#tmp_target_structure[lp] = "("
+			#tmp_target_structure[LP[lp]] = ")"
+			tmp_target_structure_bp[lp] = LP[lp]
+			tmp_target_structure_bp[LP[lp]] = lp
+	# REMOVE BLOCK CONSTRAINT AND SUBSTITUTE IT WITH SINGLE STRAND INFORMATION repsective with brackets, if allowed base pairs occure
+	# check for all allowed implicit constraint block declarators
+	for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+		occurances = []
+		for m in re.finditer(c, target_structure): # search for a declarator in the requested structure
+			occurances.append(m.start()) # save the corresponding index
+		# transform declarator into single stranded request
+		for i in occurances:
+			#tmp_target_structure[i] = "."
+			tmp_target_structure_bp[i] = i
+		# infer a base pair within the block declarated positions, if the current structure provides it.
+		for i in occurances:
+			for j in occurances:
+				if i < j:
+					if bp[i] == j:
+						#tmp_target_structure[i] = "("
+						#tmp_target_structure[bp[i]] = ")"
+						tmp_target_structure_bp[i] = bp[i]
+						tmp_target_structure_bp[bp[i]] = i
+	# CHECK FOR SEQUENCE CONSTRAINT WHICH INDUCES STRUCTURE CONSTRAINT IN THE MOMENTARY SITUATION
+	#print "Checking Cseq influence and it's induced basepairs..."
+	IUPACinducers, tmp_Cseq = getInducingSequencePositions(Cseq, degreeOfSequenceInducement)
+	if len(Cseq.strip("N")) > 0:
+#print "Processing Cseq influence"
+		# Iterate over all positions within the Base Pair stack
+		for i in BP: # Check for each base index i
+			if i < bp[i]: # if the current index is samller that the affiliated in the basepair stack of the current solution
+				bp_j = bp[i] # Actual j index of the current solution
+				BP_j = BP[i][1][0] # j index of the requested structure
+				if (i != bp_j and i == BP_j and BP[i][0] in IUPACinducers ): # if i pairs with some other base in the current structure, and i is requested single stranded and the Sequence constraint is allowed to induce...
+					if (BP[bp_j][1][0] == bp_j and BP[bp_j][0] in IUPACinducers):# If position j is requested singlestranded and position j nucleotide can induce base pairs
+						#if isCompatible(bp[i][0], bp[i][1][1], IUPAC_compatibles): # If both nucleotides, i and j are actually compatible
+						#tmp_target_structure[i] = "("
+						#tmp_target_structure[bp_j] = ")"
+						tmp_target_structure_bp[i] = bp[i]
+						tmp_target_structure_bp[bp_j] = i
+	#tts = "".join(tmp_target_structure)
+	dsreg = getBPDifferenceDistance(tmp_target_structure_bp, bp)
+	# CHECK FOR ALL DETERMINED LONELY BASE PAIRS (i<j), if they are formed
+	failLP = 0
+	for lp in LP:
+		if bp[lp] != LP[lp]:
+			isComp = isCompatible(path[lp],path[LP[lp]], IUPAC_compatibles)
+			isStru = isStructureCompatible(lp, LP[lp] ,bp)
+			if not ( isStru and isStru ): # check if the bases at the specific positions are compatible and check if the
+		# basepair can be formed according to pseudoknot free restriction. If one fails, a penalty distance is raised for that base pair
+				failLP += 1
+	#print dsreg, failLP, float(len(tmp_target_structure_bp))
+	dsLP = float(failLP)
+	return (dsreg + dsLP) /float(len(tmp_target_structure_bp))
+def getGC(sequence):
+	"""
+		Calculate GC content of a sequence
+	"""
+	GC = 0
+	for nt in sequence:
+		if nt == "G" or nt == "C":
+			GC = GC + 1
+	GC = GC/float(len(sequence))
+	return GC
+def getGCDistance(tGC, gc2, L):
+"""
+Calculate the pseudo GC content distance
+"""
+nt_coeff = L * tGC
+pc_nt = (1/float(L))*100
+#
+d = gc2 - tGC
+d = d * 100
+f = math.floor(nt_coeff)
+c = math.ceil(nt_coeff)
+if d < 0: #
+#print "case x",(abs(nt_coeff - f)), pc_nt, (abs(nt_coeff - f)) * pc_nt,
+d = d + (abs(nt_coeff - f)) * pc_nt
+elif d > 0: # case y
+#print "case y", abs(nt_coeff - c), pc_nt, abs(nt_coeff - c) * pc_nt,
+d = d - abs(nt_coeff - c) * pc_nt
+elif d == 0:
+pass
+d = round(d, 7)
+#d = max(0, abs(d)- ( max ( abs( math.ceil(nt_coeff)-(nt_coeff)) , abs(math.floor(nt_coeff)-(nt_coeff)) )/L)*100 )
+return abs(d)
+def getSequenceEditDistance(SC, path):
+"""
+Calculate sequence edit distance of a solution to the constraint
+"""#
+IUPAC = {"A":"A", "C":"C", "G":"G", "U":"U", "R":"AG", "Y":"CU", "S":"GC", "W":"AU","K":"GU", "M":"AC", "B":"CGU", "D":"AGU", "H":"ACU", "V":"ACG", "N":"ACGU"}
+edit = 0
+for i in xrange(len(SC)):
+if path[i] not in IUPAC[SC[i]]:
+edit += 1
+return edit/float(len(path))
+def getTransitions(p):
+	"""
+		Retreive transitions of a specific path/sequence
+	"""
+	transitions = []
+	for pos in xrange(len(p)):
+		if pos == 0:
+			transitions.append(str(pos) + "." + p[pos])
+		else:
+			insert = p[pos-1] + p[pos]
+			transitions.append(str(pos) + "." + insert)
+	return transitions
+def evaporate(t, er):
+	"""
+	Evaporate the terrain's pheromone trails
+	"""
+	terr, BP = t
+	c = 1
+	for key in terr:
+		p,l,c = terr[key]
+		p *= (1-er)
+		terr[key] = (p, l, c)
+def updateValue(distance, correction_term, omega):
+"""
+Retrieves a distance dependend pheromone value
+"""
+if correction_term == 0:
+return 0
+else:
+if distance == 0:
+return omega * correction_term
+else:
+return (1/float(distance)) * correction_term
+def trailBlaze(p, c_s, s, ds, dgc, dseq, dn, t, correction_terms, BPstack, verbose):
+	"""
+		Pheromone Update function accorinding to the quality of the solution
+	"""
+	terr, BP = t
+	bpstack, LP = getbpStack(c_s)
+	struct_correction_term , GC_correction_term, seq_correction_term = correction_terms
+	omega = 2.23
+	bs = updateValue(ds, struct_correction_term, omega)
+	bGC = updateValue(dgc, GC_correction_term, omega)
+	if dseq != "n.a.":
+		bSeq = updateValue(dseq, seq_correction_term, omega)
+		d = bs + bGC + bSeq
+	else:
+		d = bs + bGC
+	transitions = getTransitions(p)
+	for trans in xrange(len(transitions)): # for each transition in the path
+		id1 = int(transitions[trans].split(".")[0])
+		tar_id2 = int(BPstack[id1][1][0]) # getting requested  position id2
+		curr_id2 = int(bpstack[id1]) # getting the current situation
+		multiplicator = 0
+		if tar_id2 == curr_id2 and id1 != tar_id2 and id1 != curr_id2: # case of a base pair, having both brackets on the correct position
+			multiplicator = 1
+		elif tar_id2 == curr_id2 and id1 == tar_id2 and id1 == curr_id2: # case of a single stranded base in both structures
+			multiplicator = 1
+		p, l, c = terr[transitions[trans]] # getting the pheromone and the length value of the single path transition
+		p +=  d * multiplicator
+		terr[transitions[trans]] = (p, l, c) # updating the values wihtin the terrain's
+	t = (terr, BP)
+def updateTerrain(p, c_s, s, ds, dgc, dseq, dn, t, er, correction_terms, BPstack, verbose, ant_count):
+	"""
+		General updating function
+	"""
+	evaporate(t,er)
+	trailBlaze(p, c_s, s, ds, dgc, dseq, dn, t, correction_terms, BPstack, verbose)
+def getUsedTime(start_time):
+	"""
+		Return the used time between -start time- and now.
+	"""
+	end_time = time.time()
+	return end_time - start_time
+def good2Go(SC, L, CC, STR):
+	"""
+		Check, if all input is correct and runnable
+	"""
+	if (SC == 1 and L == 1 and CC == 1 and STR == 1):
+		return True
+	else:
+		print SC,L,CC,STR
+		return False
+def getPathFromSelection( aps, s, terrain, alpha, beta,  RNAfold, RNAfold_pattern, GC, SC, LP, verbose, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, IUPAC, pseudoknots, strategy):
+	"""
+		Returns the winning path from a selection of pathes...
+	"""
+	terr, BPs = terrain
+	win_path = 0
+	for i in xrange(aps):
+		# Generate Sequence
+		path = getPath(s, terr, BPs, alpha, beta, IUPAC, IUPAC_reverseComplements)
+		# Measure sequence features and transform them into singular distances
+		distance_structural = float(getStructuralDistance(s, SC , path, RNAfold, verbose, LP, BPs, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy))
+		distance_GC = float(getGCDistance(GC,getGC(path), len(path)))
+		distance_seq = float(getSequenceEditDistance(SC, path))
+		# Calculate Distance Score
+		D = distance_structural + distance_GC + distance_seq
+		# SELECT THE BEST-OUT-OF-k-SOLUTIONS according to distance score
+		if i == 0:
+			win_path = (path, D, distance_structural, distance_GC, distance_seq)
+		else:
+			if D < win_path[1]:
+				win_path = (path, D, distance_structural, distance_GC, distance_seq)
+	return win_path
+def substr(x, string, subst):
+	"""
+		Classical substring function
+	"""
+	s1 = string[:x-1]
+	s2 = string[x-1:x]
+	s3 = string[x:]
+#s2 = s[x+len(string)-x-1:]
+	return s1 + subst + s3
+def inConvergenceCorridor(d_struct, d_gc, BS_d_struct, BS_d_gc):
+	"""
+		Check if a solutions qualities are within the convergence corridor
+	"""
+	struct_var = ((BS_d_struct/float(4)) + 3 ) * 4
+	gc_var = (BS_d_gc + 1/float(100) * 5) + BS_d_gc + 1
+	if d_struct <= struct_var and d_gc <= gc_var:
+		return True
+	else:
+		return False
+def getGCSamplingValue(GC, tGCmax, tGCvar):
+	"""
+	Returns a suitable GC value, dependend on the user input: Either returning the single GC value,
+	which the user entered, or a smpled GC value
+	from a designated distribution in it's interavals
+	"""
+	returnval = 0
+	if tGCmax == -1.0 and tGCvar == -1.0: # regular plain tGC value as requested
+		return GC
+	elif tGCmax != -1.0 and tGCvar == -1.0: # uniform distribution tGC value sampling
+		if GC < tGCmax:
+			tmp_GC = tGCmax
+			tGCmax = GC
+			GC = tmp_GC
+		while returnval <= 0:
+			returnval = float(numpy.random.uniform(low=GC, high=tGCmax, size=1))
+		return returnval
+	elif tGCmax == -1.0 and tGCvar != -1.0: # normal distribution tGC value sampling
+		while returnval <= 0:
+			returnval = float(numpy.random.normal(GC, tGCvar, 1))
+		return returnval
+def reachableGC(C_struct):
+	"""
+		Checks if a demanded GC target content is reachable in dependence with the given sequence constraint.
+	"""
+	AU = 0
+	for i in C_struct:
+		if i == "A" or i == "U":
+			AU += 1
+	maxGC = 1 - (AU / float(len(C_struct))) # 1 - min_GC
+	return maxGC
+def runColony(s, SC, objective_to_target_distance, GC, alpha, beta, evaporation_rate, correction_terms, verbose, IUPAC, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, termination_convergence, convergence_count, reset_limit, improve, temperature, paramFile, pseudoknots, strategy):
+	"""
+		Execution function of a single ant colony finding one solution sequence
+	"""
+	retString = ""
+	retString2 = []
+	BPstack, LP = getBPStack(s, SC)
+	rGC = reachableGC(SC)
+	GC_message = ""
+	if GC > rGC:
+		print >> sys.stderr, "WARNING: Chosen target GC %s content is not reachable due to sequence constraint! Sequence Constraint GC-content is: %s" % (GC, rGC)
+		GC = rGC
+	# Initial Constraint Checks prior to execution
+	STR = isValidStructure(s)
+	START_SC , SC = checkSequenceConstraint(str(SC))
+	START_LENGTH = checkSimilarLength(str(s), str(SC))
+	START_constraint_compatibility , CompReport = checkConstaintCompatibility(BPstack, SC, IUPAC_compatibles)
+	g2g = good2Go(START_SC, START_LENGTH, START_constraint_compatibility, STR)
+	if (g2g == 1):
+		start_time = time.time()
+		max_time = 600 # seconds
+		####
+		# INITIALIZATION OF THE RNA TOOLs
+		#
+		RNAfold = init_RNAfold(213, temperature, paramFile)
+		#RNAdistance = init_RNAdistance()
+		RNAfold_pattern = re.compile('.+\n([.()]+)\s.+')
+		#RNAdist_pattern = re.compile('.*\s([\d]+)')
+		#
+		####
+		terrain = initTerrain(s)
+		#print len(terrain),
+		terrain = applyTerrainModification(terrain, s, GC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
+		#print len(terrain[0])
+		#printTerrain(terrain)
+		#exit(0)
+		global_ant_count = 0
+		global_best_ants = 0
+		criterion = False
+		met = True
+		ant_no = 1
+		prev_res = 0
+		seq = ""
+		counter = 0
+		dstruct_log = []
+		dGC_log = []
+		distance_structural = 1000
+		distance_GC = 1000
+		distance_seq = 1000
+		convergence = convergence_count
+		convergence_counter = 0
+		resets = 0
+		path = ""
+		curr_structure = ""
+		Dscore = 100000
+		distance_structural = 10000
+		distance_GC = 10000
+		distance_seq = 10000
+		best_solution = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
+		best_solution_local = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
+		best_solution_since = 0
+		ants_per_selection = 10
+		if len(LP) > 0 :
+			for lp in LP:
+				s = substr(lp + 1, s, ".")
+				s = substr(LP[lp] + 1, s, ".")
+		init = 1
+		while criterion != met and getUsedTime(start_time) < max_time:
+			iteration_start = time.time()
+			global_ant_count += 1
+			global_best_ants += 1
+			path_info = getPathFromSelection(ants_per_selection, s, terrain, alpha, beta, RNAfold, RNAfold_pattern, GC, SC, LP, verbose, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, IUPAC, pseudoknots, strategy)
+			distance_structural_prev = distance_structural
+			distance_GC_prev = distance_GC
+			distance_seq_prev = distance_seq
+			path, Dscore , distance_structural, distance_GC, distance_seq = path_info
+			curr_structure = ""
+			if pseudoknots:
+				curr_structure = getPKStructure(path, strategy)
+			else:
+				curr_structure = getRNAfoldStructure(path, RNAfold)
+			curr_solution = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
+			# BEST SOLUTION PICKING
+			if improve == "h": # hierarchical check
+				# for the global best solution
+				if distance_structural < best_solution[3] or (distance_structural == best_solution[3] and distance_GC < best_solution[4]):
+					best_solution = curr_solution
+					ant_no = 1
+				# for the local (reset) best solution
+				if distance_structural < best_solution_local[3] or (distance_structural == best_solution_local[3] and distance_GC < best_solution_local[4]):
+					best_solution_local = curr_solution
+			elif improve == "s": #score based check
+				# store best global solution
+				if Dscore < best_solution[2]:
+					best_solution = curr_solution
+					ant_no = 1
+				# store best local solution for this reset
+				if Dscore < best_solution_local[2]:
+					best_solution_local = curr_solution
+# OLD ' BEST SOLUTION ' PICKING
+#			if Dscore < best_solution[2]:
+#				best_solution = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
+#
+#			if Dscore < best_solution_local[2]:
+#				best_solution_local = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
+			distance_DN = 0
+			if verbose:
+				print "SCORE " + str(Dscore) + " Resets " + str(resets) + " #Ant " + str(global_ant_count) + " out of " + str(ants_per_selection)  + " cc " + str(convergence_counter)
+				print s, " <- target struct"
+				print best_solution[0] , " <- BS since ", str(best_solution_since), "Size of Terrrain:", len(terrain[0])
+				print best_solution[1] , " <- BS Dscore " + str(best_solution[2]) + " ds " + str(best_solution[3]) + " dGC " + str(best_solution[4]) + " dseq " + str(best_solution[5])+ " LP " + str(len(LP)) + " <- best solution stats"
+				print curr_structure, " <- CS"
+				print path,
+				print " <- CS", "Dscore", str(Dscore), "ds", distance_structural, "dGC", distance_GC, "GC", getGC(path)*100, "Dseq", distance_seq
+			#### UPDATING THE TERRAIN ACCORDING TO THE QUALITY OF THE CURRENT BESTO-OUT-OF-k SOLUTION
+			updateTerrain(path, curr_structure, s, distance_structural,distance_GC, distance_seq, distance_DN, terrain, evaporation_rate, correction_terms, BPstack, verbose, global_ant_count)
+			####
+			if verbose: print "Used time for one iteration", time.time() - iteration_start
+			# CONVERGENCE AND TERMINATION CRITERION MANAGEMENT
+			#print distance_structural, distance_GC, best_solution_local[3], best_solution_local[4]
+			if inConvergenceCorridor(curr_solution[3], curr_solution[4], best_solution_local[3], best_solution_local[4]):
+				convergence_counter += 1
+			if distance_structural_prev == distance_structural and distance_GC_prev == distance_GC:
+				convergence_counter += 1
+			if best_solution[3] == objective_to_target_distance:
+				if best_solution[4] == 0.0:
+					break
+				ant_no = ant_no + 1
+				convergence_counter -= 1
+			else:
+				ant_no = 1
+			if ant_no == termination_convergence or resets >= reset_limit or global_ant_count >= 100000 or best_solution_since == 5:
+				break
+			# RESET
+			if ant_no < termination_convergence and convergence_counter >= convergence:
+				terrain = initTerrain(s)
+				terrain = applyTerrainModification(terrain, s, GC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
+				criterion = False
+				met = True
+				ant_no = 1
+				prev_res = 0
+				pre_path = "_" * len(s)
+				path = ""
+				curr_structure = ""
+				counter = 0
+				Dscore = 100000
+				distance_structural = 1000
+				distance_GC = 1000
+				distance_seq = 1000
+				best_solution_local = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
+				convergence = convergence_count
+				convergence_counter = 0
+				if resets == 0:
+					sentinel_solution = best_solution
+					best_solution_since += 1
+				else:
+					if best_solution[2] < sentinel_solution[2]:
+						sentinel_solution = best_solution
+						best_solution_since = 0
+					else:
+						best_solution_since += 1
+				resets += 1
+		duration  = getUsedTime(start_time)
+		retString += "|Ants:" + str(global_ant_count)
+		retString += "|Resets:" + str(resets) + "/" + str(reset_limit)
+		retString += "|AntsTC:" + str(termination_convergence)
+		retString += "|CC:" + str(convergence_count)
+		retString += "|IP:" + str(improve)
+		retString += "|BSS:" + str(best_solution_since)
+		#if GC_message != "":
+		#	retString += GC_message + "\n"
+		sequence = best_solution[0]
+		struct = best_solution[1]
+		retString += "|LP:" + str(len(LP))
+		retString += "|ds:" + str(getStructuralDistance(s,SC, sequence, RNAfold, verbose, LP, BPstack, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy))
+		retString += "|dGC:" + str(best_solution[4])
+		retString += "|GC:" + str(getGC(sequence)*100)
+		retString += "|dseq:" + str(getSequenceEditDistance(SC, sequence))
+		retString += "|L:" + str(len(sequence))
+		retString += "|Time:" + str(duration)
+		retString2.append(struct)
+		retString2.append(sequence)
+		# CLOSING THE PIPES TO THE PROGRAMS
+		RNAfold.communicate()
+		#RNAdistance.communicate()
+	else: # Structural premisses are not met, htherefore the program will halt with a failure message
+		retString +=  "\nSome mistake detected\n"
+		retString +=  "SequenceConstraintCheck: " + str(START_SC) + "\nSequenceConstraint: " +  str(SC) + "\nLengthCheck: " + str(START_LENGTH) + "\nConstraintCompatibility: " + str(START_constraint_compatibility)+ "\n" + CompReport + "\n"
+	return (retString, retString2)
+def findSequence(structure, Cseq, tGC, colonies, name, alpha, beta, evaporation_rate, struct_correction_term, GC_correction_term, seq_correction_term, degreeOfSequenceInducement, file_id, verbose, output_verbose, tGCmax, tGCvar, termination_convergence, convergence_count, reset_limit, improve, seed, temperature, paramFile, pseudoknots, strategy, useGU, return_mod = False):
+	"""
+		MAIN antaRNA - ant assembled RNA
+	"""
+	if seed != "none":
+		random.seed(seed)
+	if Cseq == "":
+		sequenceconstraint = "N" * len(structure)
+	else:
+		sequenceconstraint = str(Cseq)
+	alpha = float(alpha)
+	beta = float(beta)
+	tGC = float(tGC)
+	evaporation_rate = float(evaporation_rate)
+	struct_correction_term = float(struct_correction_term)
+	GC_correction_term = float(GC_correction_term)
+	seq_correction_term = float(seq_correction_term)
+	colonies = int(colonies)
+	file_id = str(file_id)
+	tmp_verbose = verbose
+	tmp_output_verbose = output_verbose
+	verbose = tmp_output_verbose # Due to later change, this is a twistaround and a switching of purpose
+	output_verbose = tmp_verbose # Due to later change, this is a twistaround and a switching of purpose
+	correction_terms = struct_correction_term, GC_correction_term, seq_correction_term
+	temperature = float(temperature)
+	print_to_STDOUT = (file_id == "STDOUT")
+	useGU = useGU
+	if return_mod == False:
+		if print_to_STDOUT == False:
+			outfolder = '/'.join(file_id.strip().split("/")[:-1])
+			curr_dir = os.getcwd()
+			if not os.path.exists(outfolder):
+				os.makedirs(outfolder)
+			os.chdir(outfolder)
+	sequenceconstraint = transform(sequenceconstraint)
+	###############################################
+	# Allowed deviation from the structural target:
+	objective_to_target_distance = 0.0
+	# Loading the IUPAC copatibilities of nuleotides and their abstract representing symbols
+	IUPAC = {"A":"A", "C":"C", "G":"G", "U":"U", "R":"AG", "Y":"CU", "S":"GC", "W":"AU","K":"GU", "M":"AC", "B":"CGU", "D":"AGU", "H":"ACU", "V":"ACG", "N":"ACGU"}
+	IUPAC_compatibles = loadIUPACcompatibilities(IUPAC, useGU)
+	IUPAC_reverseComplements = {}
+	if useGU == False: ## Without the GU basepair
+		IUPAC_reverseComplements = {"A":"U", "C":"G", "G":"C", "U":"A", "R":"UC", "Y":"AG", "S":"GC", "W":"UA","K":"CA", "M":"UG", "B":"AGC", "D":"ACU", "H":"UGA", "V":"UGC", "N":"ACGU"}
+	else: ## allowing the GU basepair
+		IUPAC_reverseComplements = {"A":"U", "C":"G", "G":"UC", "U":"AG", "R":"UC", "Y":"AG", "S":"UGC", "W":"UAG","K":"UCAG", "M":"UG", "B":"AGCU", "D":"AGCU", "H":"UGA", "V":"UGC", "N":"ACGU"}
+	result = []
+	for col in xrange(colonies):
+		# Checking the kind of taget GC value should be used
+		GC = getGCSamplingValue(tGC, tGCmax, tGCvar)
+		# Actual execution of a ant colony procesdure
+		output_v, output_w  =  runColony(structure, sequenceconstraint, objective_to_target_distance, GC, alpha, beta, evaporation_rate, correction_terms, verbose, IUPAC, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, termination_convergence, convergence_count, reset_limit, improve, temperature, paramFile, pseudoknots, strategy)
+		# Post-Processing the output of a ant colony procedure
+		line = ">" + name + str(col)
+		if output_verbose:
+			line += "|Cstr:" + structure + "|Cseq:" + sequenceconstraint + "|Alpha:" + str(alpha) + "|Beta:" + str(beta) + "|tGC:" +  str(GC)  + "|ER:" + str(evaporation_rate) + "|Struct_CT:" + str(struct_correction_term) + "|GC_CT:" + str(GC_correction_term) + "|Seq_CT:" + str(seq_correction_term) + output_v + "\n" + "\n".join(output_w)
+		else:
+			line += "\n" + output_w[1]
+		if return_mod == False:
+			if print_to_STDOUT:
+				print line
+			else:
+				if col == 0:
+					print2file(file_id, line, 'w')
+				else:
+					print2file(file_id, line, 'a')
+		else:
+			result.append(line)
+	if return_mod == True:
+		return result
+	if print_to_STDOUT == False:
+		os.chdir(curr_dir)
+def execute(args):
+	"""
+		CHECK AND PARSE THE COMMAND LINE STUFF
+	"""
+	# Checking the arguments, parsed from the shell
+	###############################################
+	name = args.name
+	structure = args.Cstr
+	if args.Cseq == "":
+		sequenceconstraint = "N" * len(structure)
+	else:
+		sequenceconstraint = args.Cseq
+	seed = args.seed
+	alpha = args.alpha
+	beta = args.beta
+	tGC = args.tGC
+	if tGC < 0 or tGC > 1:
+		print "Error: Chosen tGC not in range [0,1]"
+		exit(1)
+	evaporation_rate = args.ER
+	struct_correction_term = args.Cstrweight
+	GC_correction_term = args.Cgcweight
+	seq_correction_term = args.Cseqweight
+	colonies = args.noOfColonies
+	degreeOfSequenceInducement = args.level
+	file_id = args.output_file
+	verbose = args.verbose
+	output_verbose = args.output_verbose
+	tGCmax = args.tGCmax
+	tGCvar = args.tGCvar
+	termination_convergence = args.antsTerConv
+	convergence_count = args.ConvergenceCount
+	temperature = args.temperature
+	reset_limit = args.Resets
+	improve = args.improve_procedure
+	### RNAfold parameterfile
+	paramFile = args.paramFile
+	# Using the pkiss program under user changeable parameters
+	pseudoknots = args.pseudoknots
+	# Loading the optimized parameters for pseudoknots and ignore user input
+	if args.pseudoknot_parameters:
+		alpha = 1.0
+		beta = 0.1
+		evaporation_rate = 0.2
+		struct_correction_term = 0.1
+		GC_correction_term = 1.0
+		seq_correction_term = 0.5
+		termination_convergence = 50
+		convergence_count = 100
+	strategy = args.strategy
+	useGU = args.useGUBasePair
+	checkForViennaTools()
+	if pseudoknots:
+		checkForpKiss()
+	findSequence(structure, sequenceconstraint, tGC, colonies, name, alpha, beta, evaporation_rate, struct_correction_term, GC_correction_term, seq_correction_term, degreeOfSequenceInducement, file_id, verbose, output_verbose, tGCmax, tGCvar, termination_convergence, convergence_count, reset_limit, improve, seed, temperature, paramFile, pseudoknots,  strategy, useGU)
+def exe():
+	"""
+	MAIN EXECUTABLE WHICH PARSES THE INPUT LINE
+	"""
+	argument_parser = argparse.ArgumentParser(
+	description = """
+	#########################################################################
+	#       antaRNA - ant assembled RNA                                     #
+	#       -> Ant Colony Optimized RNA Sequence Design                     #
+	#       ------------------------------------------------------------    #
+	#       Robert Kleinkauf (c) 2015                                       #
+	#       Bioinformatics, Albert-Ludwigs University Freiburg, Germany     #
+	#########################################################################
+	- For antaRNA only the VIENNNA RNA Package must be installed on your linux system.
+	  antaRNA will only check, if the executables of RNAfold and RNAdistance of the ViennaRNA package can be found. If those programs are
+	  not installed correctly, no output will be generated, an also no warning will be prompted.
+	  So the binary path of the Vienna Tools must be set up correctly in your system's PATH variable in order to run antaRNA correctly!
+- antaRNA was only tested under Linux.
+- For questions and remarks please feel free to contact us at http://www.bioinf.uni-freiburg.de/
+	""",
+	epilog = """
+	Example calls:
+		python antaRNA.py --Cstr "...(((...)))..." --tGC 0.5 -n 2
+		python antaRNA.py --Cstr ".........AAA(((...)))AAA........." --tGC 0.5 -n 10 --output_file /path/to/antaRNA_TESTRUN -ov
+		python antaRNA.py --Cstr "BBBBB....AAA(((...)))AAA....BBBBB" --Cseq "NNNNANNNNNCNNNNNNNNNNNGNNNNNNUNNN" --tGC 0.5 -n 10
+	#########################################################################
+	#       --- Hail to the Queen!!! All power to the swarm!!! ---          #
+	#########################################################################
+		""",
+	#formatter_class=RawTextHelpFormatter
+	)
+	# mandatorys
+	argument_parser.add_argument("-Cstr", "--Cstr", help="Structure constraint using RNA dotbracket notation with fuzzy block constraint. \n(TYPE: %(type)s)\n\n", type=str, required=True)
+	argument_parser.add_argument("-tGC", "--tGC", help="Objective target GC content in [0,1].\n(TYPE: %(type)s)\n\n", type=float, required=True)
+	argument_parser.add_argument("-n", "--noOfColonies", help="Number of sequences which shall be produced. \n(TYPE: %(type)s)\n\n\n\n", type=int,  default=1)
+	argument_parser.add_argument("-GU", "--useGUBasePair", help="Allowing GU base pairs. \n\n", action="store_true")
+	argument_parser.add_argument("-s", "--seed", help = "Provides a seed value for the used pseudo random number generator.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="none")
+	argument_parser.add_argument("-ip", "--improve_procedure", help = "Select the improving method.  h=hierarchical, s=score_based.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="s")
+	argument_parser.add_argument("-r", "--Resets", help = "Amount of maximal terrain resets, until the best solution is retuned as solution.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=5)
+	argument_parser.add_argument("-CC", "--ConvergenceCount", help = "Delimits the convergence count criterion for a reset.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=130)
+	argument_parser.add_argument("-aTC", "--antsTerConv", help = "Delimits the amount of internal ants for termination convergence criterion for a reset.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=50)
+	argument_parser.add_argument("-p", "--pseudoknots", help = "Switch to pseudoknot based prediction using pKiss. Check the pseudoknot parameter usage!!!\n\n", action="store_true")
+	argument_parser.add_argument("-pkPar", "--pseudoknot_parameters", help = "Enable optimized parameters for the usage of pseudo knots (Further parameter input ignored).\n\n", action="store_true")
+	argument_parser.add_argument("--strategy", help = "Defining the pKiss folding strategy.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="A")
+	# Mutual Exclusiv target GC distribution variables
+	#tGCgroup = argument_parser.add_mutually_exclusive_group()
+	argument_parser.add_argument("-tGCmax", "--tGCmax", help = "Provides a maximum tGC value [0,1] for the case of uniform distribution sampling. The regular tGC value serves as minimum value.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=-1.0)
+	argument_parser.add_argument("-tGCvar", "--tGCvar", help = "Provides a tGC variance (sigma square) for the case of normal distribution sampling. The regular tGC value serves as expectation value (mu).\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=-1.0)
+	argument_parser.add_argument("-t", "--temperature", help = "Provides a temperature for the folding algorithms.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=37.0)
+	argument_parser.add_argument("-P", "--paramFile", help = "Changes the energy parameterfile of RNAfold. If using this explicitly, please provide a suitable energy file delivered by RNAfold. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="")
+	argument_parser.add_argument("-of","--output_file", help="Provide a path and an output file, e.g. \"/path/to/the/target_file\". \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="STDOUT")
+	argument_parser.add_argument("-Cseq", "--Cseq", help="Sequence constraint using RNA nucleotide alphabet {A,C,G,U} and wild-card \"N\". \n(TYPE: %(type)s)\n\n", type=str, default = "")
+	argument_parser.add_argument("-l", "--level", help="Sets the level of allowed influence of sequence constraint on the structure constraint [0:no influence; 3:extensive influence].\n(TYPE: %(type)s)\n\n", type=int, default = 1)
+	argument_parser.add_argument("--name", help="Defines a name which is used in the sequence output. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="antaRNA_")
+	argument_parser.add_argument("-a", "--alpha", help="Sets alpha, probability weight for terrain pheromone influence. [0,1] \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=1.0)
+	argument_parser.add_argument("-b", "--beta", help="Sets beta, probability weight for terrain path influence. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=1.0)
+	argument_parser.add_argument("-er", "--ER", help="Pheromone evaporation rate. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=0.2)
+	argument_parser.add_argument("-Cstrw", "--Cstrweight", help="Structure constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=0.5)
+	argument_parser.add_argument("-Cgcw", "--Cgcweight", help="GC content constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=5.0)
+	argument_parser.add_argument("-Cseqw", "--Cseqweight", help="Sequence constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n\n", type=float, default=1.0)
+	argument_parser.add_argument("-ov", "--output_verbose", help="Displayes intermediate output.\n\n", action="store_true")
+	argument_parser.add_argument("-v", "--verbose", help="Prints additional features and stats to the headers of the produced sequences. Also adds the structure of the sequence.\n\n", action="store_true")
+	args = argument_parser.parse_args()
+	execute(args)
+def checkForViennaTools():
+	"""
+	Checking for the presence of the Vienna tools in the system by which'ing for RNAfold and RNAdistance
+	"""
+	RNAfold_output = subprocess.Popen(["which", "RNAfold"], stdout=subprocess.PIPE).communicate()[0].strip()
+	if len(RNAfold_output) > 0 and RNAfold_output.find("found") == -1 and RNAfold_output.find(" no ") == -1:
+		return True
+	else:
+		print "It seems the Vienna RNA Package is not installed on your machine. Please do so!"
+		print "You can get it at http://www.tbi.univie.ac.at/"
+		exit(0)
+def checkForpKiss():
+	"""
+		Checking for the presence of pKiss
+	"""
+	pKiss_output = subprocess.Popen(["which", "pKiss"], stdout=subprocess.PIPE).communicate()[0].strip()
+	if len(pKiss_output) > 0 and pKiss_output.find("found") == -1 and pKiss_output.find(" no ") == -1:
+		return True
+	else:
+		print "It seems that pKiss is not installed on your machine. Please do so!"
+		print "You can get it at http://bibiserv2.cebitec.uni-bielefeld.de/pkiss"
+		exit(0)
+if __name__ == "__main__":
+	exe()

Mercurial > repos > rnateam > antarna

comparison antaRNA.py @ 0:fcf4719d3831 draft