Mercurial > repos > rnateam > antarna
comparison antaRNA.py @ 0:fcf4719d3831 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/antarna/ commit 71414cf7f040d610afc3f02be31446efc3a82a40-dirty
| author | rnateam |
|---|---|
| date | Wed, 13 May 2015 11:02:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:fcf4719d3831 |
|---|---|
| 1 import numpy | |
| 2 import sys | |
| 3 import random | |
| 4 import subprocess | |
| 5 import re | |
| 6 import decimal | |
| 7 import math | |
| 8 import os | |
| 9 import shutil | |
| 10 import time | |
| 11 import types | |
| 12 import argparse | |
| 13 #from argparse import RawTextHelpFormatter | |
| 14 | |
| 15 ############################# | |
| 16 # FUNCTIONS | |
| 17 | |
| 18 def print2file(f, i, m): | |
| 19 """ | |
| 20 print content i to file f in mode m | |
| 21 """ | |
| 22 line = str(i) | |
| 23 if m == "a": | |
| 24 call = "echo \"" + line + "\" >> " + f | |
| 25 elif m == "w": | |
| 26 call = "echo \"" + line + "\" > " + f | |
| 27 os.system(call) | |
| 28 | |
| 29 # checking and correcting the alphabet of the constraint sequence | |
| 30 def checkSequenceConstraint(SC): | |
| 31 """ | |
| 32 Checks the Sequence constraint for illegal nucleotide characters | |
| 33 """ | |
| 34 out = "" | |
| 35 for c in SC: | |
| 36 c = c.upper() | |
| 37 if c not in "ACGURYSWKMBDHVN": | |
| 38 # and c!= "R" and c != "Y" and c != "S" and c != "W" and c != "K" and c != "M" and c != "B" and c != "D" and c != "H" and c != "V": | |
| 39 if c == "T": | |
| 40 c = "U" | |
| 41 else: | |
| 42 print "\tIllegal Character in the constraint sequence!" | |
| 43 print "\tPlease use the IUPAC nomenclature for defining nucleotides in the constraint sequence!" | |
| 44 print "\tA Adenine" | |
| 45 print "\tC Cytosine" | |
| 46 print "\tG Guanine" | |
| 47 print "\tT/U Thymine/Uracil" | |
| 48 print "\tR A or G" | |
| 49 print "\tY C or T/U" | |
| 50 print "\tS G or C" | |
| 51 print "\tW A or T/U" | |
| 52 print "\tK G or T/U" | |
| 53 print "\tM A or C" | |
| 54 print "\tB C or G or T/U" | |
| 55 print "\tD A or G or T/U" | |
| 56 print "\tH A or C or T/U" | |
| 57 print "\tV A or C or G" | |
| 58 print "\tN any base" | |
| 59 exit(0) | |
| 60 out += c | |
| 61 return (1, out) | |
| 62 | |
| 63 | |
| 64 def transform(seq): | |
| 65 """ | |
| 66 Transforms "U" to "T" for the processing is done on DNA alphabet | |
| 67 """ | |
| 68 S = "" | |
| 69 for s in seq: | |
| 70 if s == "T": | |
| 71 S += "U" | |
| 72 else: | |
| 73 S += s | |
| 74 return S | |
| 75 | |
| 76 | |
| 77 def checkSimilarLength(s, SC): | |
| 78 """ | |
| 79 Compares sequence and structure constraint length | |
| 80 """ | |
| 81 if len(s) == len(SC): | |
| 82 return 1 | |
| 83 else: | |
| 84 return 0 | |
| 85 | |
| 86 | |
| 87 def isStructure(s): | |
| 88 """ | |
| 89 Checks if the structure constraint only contains "(", ")", and "." and legal fuzzy structure constraint characters. | |
| 90 """ | |
| 91 returnvalue = 1 | |
| 92 for a in range(0,len(s)): | |
| 93 if s[a] not in ".()[]{}<>": | |
| 94 if s[a] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
| 95 returnvalue = 0 | |
| 96 return returnvalue | |
| 97 | |
| 98 | |
| 99 def isBalanced(s): | |
| 100 """ | |
| 101 Check if the structure s is of a balanced nature | |
| 102 """ | |
| 103 | |
| 104 balance = 1 | |
| 105 for bracket in ["()", "[]", "{}", "<>"]: | |
| 106 counter = 0 | |
| 107 for a in xrange(len(s)): | |
| 108 if s[a] in bracket[0]: | |
| 109 counter += 1 | |
| 110 elif s[a] in bracket[1]: | |
| 111 counter -= 1 | |
| 112 if counter != 0: | |
| 113 balance = 0 | |
| 114 return balance | |
| 115 | |
| 116 | |
| 117 | |
| 118 def fulfillsHairpinRule(s): | |
| 119 """ | |
| 120 CHECKING FOR THE 3 nt LOOP INTERSPACE | |
| 121 for all kind of basepairs, even wihtin the pdeudoknots | |
| 122 """ | |
| 123 | |
| 124 fulfillsRules = 1 | |
| 125 for bracket in ["()", "[]", "{}", "<>"]: | |
| 126 last_opening_char = 0 | |
| 127 check = 0 | |
| 128 for a in xrange(len(s)): | |
| 129 if s[a] == bracket[0]: | |
| 130 last_opening_char = a | |
| 131 check = 1 | |
| 132 elif s[a] == bracket[1] and check == 1: | |
| 133 check = 0 | |
| 134 if a - last_opening_char < 4: | |
| 135 return 0 | |
| 136 return 1 | |
| 137 | |
| 138 | |
| 139 def isValidStructure(s): | |
| 140 """ | |
| 141 Checks, if the structure s is a valid structure | |
| 142 """ | |
| 143 | |
| 144 Structure = isStructure(s) | |
| 145 Balanced = isBalanced(s) | |
| 146 HairpinRule = fulfillsHairpinRule(s) | |
| 147 | |
| 148 if Structure == 1 and Balanced == 1 and HairpinRule == 1: | |
| 149 return 1 | |
| 150 else: | |
| 151 print Structure, Balanced, HairpinRule | |
| 152 return 0 | |
| 153 | |
| 154 def loadIUPACcompatibilities(IUPAC, useGU): | |
| 155 """ | |
| 156 Generating a hash containing all compatibilities of all IUPAC RNA NUCLEOTIDES | |
| 157 """ | |
| 158 compatible = {} | |
| 159 for nuc1 in IUPAC: # ITERATING OVER THE DIFFERENT GROUPS OF IUPAC CODE | |
| 160 sn1 = list(IUPAC[nuc1]) | |
| 161 for nuc2 in IUPAC: # ITERATING OVER THE DIFFERENT GROUPS OF IUPAC CODE | |
| 162 sn2 = list(IUPAC[nuc2]) | |
| 163 compatib = 0 | |
| 164 for c1 in sn1: # ITERATING OVER THE SINGLE NUCLEOTIDES WITHIN THE RESPECTIVE IUPAC CODE: | |
| 165 for c2 in sn2: # ITERATING OVER THE SINGLE NUCLEOTIDES WITHIN THE RESPECTIVE IUPAC CODE: | |
| 166 # CHECKING THEIR COMPATIBILITY | |
| 167 if useGU == True: | |
| 168 if (c1 == "A" and c2 == "U") or (c1 == "U" and c2 == "A") or (c1 == "C" and c2 == "G") or (c1 == "G" and c2 == "C") or (c1 == "G" and c2 == "U") or (c1 == "U" and c2 == "G"): | |
| 169 compatib = 1 | |
| 170 else: | |
| 171 if (c1 == "A" and c2 == "U") or (c1 == "U" and c2 == "A") or (c1 == "C" and c2 == "G") or (c1 == "G" and c2 == "C"): | |
| 172 compatib = 1 | |
| 173 compatible[nuc1 + "_" + nuc2] = compatib # SAVING THE RESPECTIVE GROUP COMPATIBILITY, REVERSE SAVING IS NOT REQUIRED, SINCE ITERATING OVER ALL AGAINST ALL | |
| 174 return compatible | |
| 175 | |
| 176 def isCompatibleToSet(c1, c2, IUPAC_compatibles): | |
| 177 """ | |
| 178 Checks compatibility of c1 wihtin c2 | |
| 179 """ | |
| 180 compatible = True | |
| 181 for setmember in c2: | |
| 182 #print setmember | |
| 183 if isCompatible(c1, setmember, IUPAC_compatibles) == False: | |
| 184 return False | |
| 185 return compatible | |
| 186 | |
| 187 | |
| 188 def isCompatible(c1, c2, IUPAC_compatibles): | |
| 189 """ | |
| 190 Checks compatibility between character c1 and c2 | |
| 191 """ | |
| 192 if IUPAC_compatibles[c1 + "_" + c2] == 1: | |
| 193 return True | |
| 194 else: | |
| 195 return False | |
| 196 | |
| 197 | |
| 198 def isStructureCompatible(lp1, lp2 ,bp): | |
| 199 """ | |
| 200 Checks, if the region within lp1 and lp2 is structurally balanced | |
| 201 """ | |
| 202 x = lp1 + 1 | |
| 203 while (x < lp2): | |
| 204 if (bp[x] <= lp1 or bp[x] > lp2): | |
| 205 return False | |
| 206 if x == bp[x]: | |
| 207 x += 1 | |
| 208 else: | |
| 209 x = bp[x] + 1 | |
| 210 return x == lp2 | |
| 211 | |
| 212 | |
| 213 def checkConstaintCompatibility(basepairstack, sequenceconstraint, IUPAC_compatibles): | |
| 214 """ | |
| 215 Checks if the constraints are compatible to each other | |
| 216 """ | |
| 217 returnstring = "" | |
| 218 compatible = 1 | |
| 219 for id1 in basepairstack: # key = (constraint , (pos, constraint))) | |
| 220 constr1 = basepairstack[id1][0] | |
| 221 id2 = basepairstack[id1][1][0] | |
| 222 constr2 = basepairstack[id1][1][1] | |
| 223 | |
| 224 if id1 != id2 and not isCompatible(constr1, constr2, IUPAC_compatibles): | |
| 225 | |
| 226 compatible = 0 | |
| 227 returnstring += "nucleotide constraint " + str(constr1) + " at position " + str(id1) + " is not compatible with nucleotide constraint " + str(constr2) + " at position " + str(id2) + "\n" | |
| 228 #if not isCompatible(basepairstack[basepair][0], basepairstack[basepair][1][1]): | |
| 229 | |
| 230 #compatible = 0 | |
| 231 #else: | |
| 232 #returnstring += "nucleotide constraint " + str(basepairstack[basepair][0]) + " at position " + str(basepair) + " is compatible with nucleotide constraint " + str(basepairstack[basepair][1][1]) + " at position " + str(basepairstack[basepair][1][0]) + "\n" | |
| 233 return (compatible, returnstring) | |
| 234 | |
| 235 | |
| 236 def getLP(BPSTACK): | |
| 237 """ | |
| 238 Retreives valid lonley base pairs from a base pair stack | |
| 239 """ | |
| 240 #20 ('N', (>BLOCK<, 'N')) | |
| 241 | |
| 242 # geting single base pairs | |
| 243 stack = {} | |
| 244 LP = {} | |
| 245 if type(BPSTACK[random.choice(BPSTACK.keys())]) == types.TupleType: | |
| 246 for i in BPSTACK.keys(): | |
| 247 #if str(BPSTACK[i][1][0]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
| 248 stack[i] = int(BPSTACK[i][1][0]) | |
| 249 #print i , BPSTACK[i][1][0] | |
| 250 else: | |
| 251 for i in BPSTACK.keys(): | |
| 252 #if str(BPSTACK[i]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
| 253 stack[i] = BPSTACK[i] | |
| 254 | |
| 255 # removing redundant base pair indices | |
| 256 for i in stack.keys(): | |
| 257 if i >= stack[i]: | |
| 258 del stack[i] | |
| 259 | |
| 260 # actual checking for single lonley base pairs | |
| 261 for i in stack.keys(): | |
| 262 if not (i-1 in stack and stack[i-1] == stack[i] + 1) and not (i+1 in stack and stack[i+1] == stack[i] - 1): | |
| 263 LP[i] = stack[i] | |
| 264 | |
| 265 ##actual removal of 2er lonley base pairs | |
| 266 for i in stack.keys(): | |
| 267 if not (i-1 in stack and stack[i-1] == stack[i] + 1) and (i+1 in stack and stack[i+1] == stack[i] - 1) and not (i+2 in stack and stack[i+2] == stack[i] - 2): | |
| 268 LP[i] = stack[i] | |
| 269 LP[i+1] = stack[i+1] | |
| 270 | |
| 271 | |
| 272 #if type(BPSTACK[random.choice(BPSTACK.keys())]) == types.TupleType: | |
| 273 #for i in BPSTACK.keys(): | |
| 274 | |
| 275 ##if str(BPSTACK[i][1][0]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
| 276 #stack[i] = int(BPSTACK[i][1][0]) | |
| 277 ##print i , BPSTACK[i][1][0] | |
| 278 #else: | |
| 279 #for i in BPSTACK.keys(): | |
| 280 ##if str(BPSTACK[i]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
| 281 #stack[i] = BPSTACK[i] | |
| 282 | |
| 283 #for i in stack.keys(): | |
| 284 #if i >= stack[i]: | |
| 285 #del stack[i] | |
| 286 | |
| 287 | |
| 288 | |
| 289 return LP | |
| 290 | |
| 291 | |
| 292 def getBPStack(s, seq): | |
| 293 """ | |
| 294 Returns a dictionary of the corresponding basepairs of the structure s and the sequence constraint seq. | |
| 295 """ | |
| 296 tmp_stack = {"()":[], "{}":[], "[]":[], "<>":[]} | |
| 297 bpstack = {} | |
| 298 for i in xrange(len(s)): | |
| 299 | |
| 300 # REGULAR SECONDARY STRUCTURE DETECTION | |
| 301 if s[i] in "(){}[]<>": | |
| 302 | |
| 303 no = 0 | |
| 304 ### opening | |
| 305 if s[i] in "([{<": | |
| 306 if s[i] == "(": | |
| 307 tmp_stack["()"].append((i, seq[i])) | |
| 308 elif s[i] == "[": | |
| 309 tmp_stack["[]"].append((i, seq[i])) | |
| 310 elif s[i] == "{": | |
| 311 tmp_stack["{}"].append((i, seq[i])) | |
| 312 elif s[i] == "<": | |
| 313 tmp_stack["<>"].append((i, seq[i])) | |
| 314 | |
| 315 #closing | |
| 316 elif s[i] in ")]}>": | |
| 317 if s[i] == ")": | |
| 318 no, constr = tmp_stack["()"].pop() | |
| 319 elif s[i] == "]": | |
| 320 no, constr = tmp_stack["[]"].pop() | |
| 321 elif s[i] == "}": | |
| 322 no, constr = tmp_stack["{}"].pop() | |
| 323 elif s[i] == ">": | |
| 324 no, constr = tmp_stack["<>"].pop() | |
| 325 bpstack[no] = (constr, (i, seq[i])) | |
| 326 bpstack[i] = (seq[i] ,(no, constr)) | |
| 327 | |
| 328 elif s[i] == ".": | |
| 329 bpstack[i] = (seq[i], (i, seq[i])) | |
| 330 elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
| 331 bpstack[i] = (seq[i], (i, seq[i])) | |
| 332 | |
| 333 return (bpstack, getLP(bpstack)) | |
| 334 | |
| 335 | |
| 336 def getbpStack(s): | |
| 337 """ | |
| 338 Returns a dictionary of the corresponding basepairs of the structure s and the sequence constraint seq. | |
| 339 """ | |
| 340 tmp_stack = {"()":[], "{}":[], "[]":[], "<>":[]} | |
| 341 bpstack = {} | |
| 342 | |
| 343 for i in xrange(len(s)): | |
| 344 if s[i] in "(){}[]<>": | |
| 345 | |
| 346 no = 0 | |
| 347 ### opening | |
| 348 if s[i] in "([{<": | |
| 349 if s[i] == "(": | |
| 350 tmp_stack["()"].append(i) | |
| 351 elif s[i] == "[": | |
| 352 tmp_stack["[]"].append(i) | |
| 353 elif s[i] == "{": | |
| 354 tmp_stack["{}"].append(i) | |
| 355 elif s[i] == "<": | |
| 356 tmp_stack["<>"].append(i) | |
| 357 | |
| 358 #closing | |
| 359 elif s[i] in ")]}>": | |
| 360 if s[i] == ")": | |
| 361 no = tmp_stack["()"].pop() | |
| 362 elif s[i] == "]": | |
| 363 no = tmp_stack["[]"].pop() | |
| 364 elif s[i] == "}": | |
| 365 no = tmp_stack["{}"].pop() | |
| 366 elif s[i] == ">": | |
| 367 no = tmp_stack["<>"].pop() | |
| 368 bpstack[no] = i # save basepair in the format {opening base id (opening seq constr,(closing base id, closing seq constr))} | |
| 369 bpstack[i] = no # save basepair in the format {closing base id (closing seq constr,(opening base id, opening seq constr))} | |
| 370 | |
| 371 elif s[i] == ".": # no structural constaint given: produce entry, which references itself as a base pair partner.... | |
| 372 bpstack[i] = i | |
| 373 | |
| 374 elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
| 375 bpstack[i] = i | |
| 376 | |
| 377 #elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
| 378 ## per position, assigned to a certain block, the target nucleotide, with whcih it should interact is marked with the specific | |
| 379 ## block character | |
| 380 #bpstack[i] = s[i] | |
| 381 | |
| 382 return (bpstack, getLP(bpstack)) | |
| 383 | |
| 384 def maprange( a, b, s): | |
| 385 """ | |
| 386 Mapping function | |
| 387 """ | |
| 388 (a1, a2), (b1, b2) = a, b | |
| 389 return b1 + ((s - a1) * (b2 - b1) / (a2 - a1)) | |
| 390 | |
| 391 | |
| 392 def applyGCcontributionPathAdjustment(pathlength, tmpGC, nt): | |
| 393 """ | |
| 394 GC path length contribution calculation. | |
| 395 """ | |
| 396 GCadjustment = 1.5 | |
| 397 minimum = 0.5 | |
| 398 upper = GCadjustment | |
| 399 lower = minimum | |
| 400 | |
| 401 if nt == "A" or nt == "U": | |
| 402 pathlength = pathlength * maprange( (0, 1) , (lower, upper), tmpGC) | |
| 403 | |
| 404 if nt == "G" or nt == "C": | |
| 405 #pathlength = pathlength * (float(1-tmpGC)) | |
| 406 pathlength = pathlength * maprange( (1, 0) , (lower, upper), tmpGC) | |
| 407 return pathlength | |
| 408 | |
| 409 def getConstraint(TE, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements): | |
| 410 """ | |
| 411 Dependend on the situation in the constraint an the respective path section, setting wether a specific constraint can be given or not (for that path section) | |
| 412 """ | |
| 413 # TE :: transition element / path section under dispute | |
| 414 # id1 :: id of the position of the caharacter to which the transition is leading to | |
| 415 # id2 :: id of the position of the character, which is listed in the BPinformation, it can be id1 as well, when no bp is present | |
| 416 # val :: BPstack information of the specific position | |
| 417 # constr1 :: constraining character of pos id1 | |
| 418 # constr2 :: constraining character of pos id2 | |
| 419 | |
| 420 id1 = int(TE.split(".")[0]) | |
| 421 val = BPstack[id1] # check out the value of the destination character in the basepair/constraint stack | |
| 422 constr1 = val[0] # getting the constraint character of position id1 | |
| 423 id2 = int(val[1][0]) # getting position id2 | |
| 424 constr2 = val[1][1] # getting the sequence constraint for position id2 | |
| 425 targetNucleotide = TE.split(".")[1][-1:] # where the edge is leading to | |
| 426 | |
| 427 c1 = set(IUPAC[constr1]) # getting all explicit symbols of c1 | |
| 428 c2 = set(IUPAC_reverseComplements[constr2]) # getting the reverse complement explicit symbols of c2 | |
| 429 | |
| 430 if targetNucleotide in c1: | |
| 431 if id1 == id2: | |
| 432 return 1 | |
| 433 else: | |
| 434 if targetNucleotide in c2: | |
| 435 return 1 | |
| 436 else: | |
| 437 return 0 | |
| 438 else: | |
| 439 return 0 | |
| 440 | |
| 441 """ | |
| 442 def getConstraint(TE, BPstack): | |
| 443 # TE :: transition element / path section | |
| 444 # id1 :: id of the position of the caharacter to which the transition is leading to | |
| 445 # id2 :: id of the position of the character, which is listed in the BPinformation, it can be id1 as well, when no bp is present | |
| 446 # val :: BPstack information of the specific position | |
| 447 # constr1 :: constraining character of pos id1 | |
| 448 # constr2 :: constraining character of pos id2 | |
| 449 | |
| 450 ### BPstack [id1] = (constr1, (id2, constr2)) | |
| 451 | |
| 452 id1 = TE.split(".")[0] | |
| 453 #print id1 | |
| 454 #id1 = TE.find(TE.strip("_")) # strip the path section and getting the position of the section | |
| 455 #if len(TE.strip("_")) == 2: # check if the path section is from an internal and not an initial transition | |
| 456 #id1 += 1 # increase position id1 by 1, since the last character of the section is the destination character | |
| 457 val = BPstack[int(id1)] # check out the value of the destination character in the basepair/constraint stack | |
| 458 constr1 = val[0] # getting the constraint character of position id1 | |
| 459 id2 = val[1][0] # getting position id2 | |
| 460 constr2 = val[1][1] # getting the sequence constraint for position id2 | |
| 461 #print TE, id1, constr1, id2, constr2, | |
| 462 | |
| 463 #TE.split(".")[1][-1:] | |
| 464 if id1 == id2: # both ids were the same with either character, sequential or no sequential constraint -> no basepair constraint | |
| 465 if constr1 == TE.split(".")[1][-1:] and constr2 == TE.split(".")[1][-1:]: # case if the single base constraints on position id1 == id2 are the same as the destination character on id1 | |
| 466 #print 1 | |
| 467 return 1 | |
| 468 elif constr1 == constr2 == "N": # case if the single base constraints on position id1 == id2 has no constraint | |
| 469 #print 1 | |
| 470 return 1 | |
| 471 else: # single base sequence constraints differ | |
| 472 #print 0 | |
| 473 return 0 | |
| 474 | |
| 475 elif id1 != id2: # showing differentq ids, indicating a bp, (basepair structural constraint) | |
| 476 if constr1 == "N" and constr2 == "N": # no sequence constraint | |
| 477 #print 1 | |
| 478 return 1 | |
| 479 if constr1 == "N" and constr2 != "N": # c1 has no constraint, c2 has character constraint (sequence constraint of closing bases) | |
| 480 if TE.split(".")[1][-1:] == complementBase(constr2): # the current path section destination base is equal to the complement base of the mentioned sequence constraint in constr2 | |
| 481 #print 1 | |
| 482 return 1 | |
| 483 else: # case if the current path section destination base is not equeal to the mentioned complement sequence constraint in constr2 | |
| 484 #print 0 | |
| 485 return 0 | |
| 486 if constr1 != "N" and constr2 == "N": # c1 has character constraint, c2 has no character constraint (sequence constraint in the opening base) | |
| 487 if TE.split(".")[1][-1:] == constr1: # the current path section destination base is as constrained with constr1 | |
| 488 #print 1 | |
| 489 return 1 | |
| 490 else: # the current path section destination base is not as constrained in constr1 | |
| 491 #print 0 | |
| 492 return 0 | |
| 493 if constr1 != "N" and constr2 != "N": # both positions have sequential constraint | |
| 494 if TE.split(".")[1][-1:] == constr1: | |
| 495 #print 1 | |
| 496 return 1 | |
| 497 else: | |
| 498 #print 0 | |
| 499 return 0 | |
| 500 """ | |
| 501 | |
| 502 def applyTerrainModification(terrain, s, tmpGC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements): | |
| 503 #nucleotides = {'A': 0, 'C': 1,'G': 2,'T': 3} | |
| 504 | |
| 505 dels = [] | |
| 506 for terrainelement in sorted(terrain): | |
| 507 pheromone, pathlength = terrain[terrainelement] | |
| 508 pheromone = getConstraint(terrainelement, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements) | |
| 509 pathlength = getConstraint(terrainelement, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements) | |
| 510 pathlength = applyGCcontributionPathAdjustment(pathlength, tmpGC,terrainelement.split(".")[1][-1:]) | |
| 511 if pheromone * pathlength == 0: dels.append(terrainelement) | |
| 512 terrain[terrainelement] = (pheromone, pathlength,[]) | |
| 513 further_dels = {} | |
| 514 for terrainelement in sorted(dels): | |
| 515 pos, nucs = terrainelement.split(".") | |
| 516 if int(pos) < len(s)-1: | |
| 517 to_nt = nucs[-1:] | |
| 518 successor_pos = int(pos) + 1 | |
| 519 for i in ["A", "C", "G", "U"]: | |
| 520 del_element = str(successor_pos) + "." + to_nt + i | |
| 521 further_dels[del_element] = 1 | |
| 522 further_dels[terrainelement] = 1 | |
| 523 # deleting the inbound and outbound edges, which are forbidden | |
| 524 for terrainelement in further_dels: | |
| 525 del terrain[terrainelement] | |
| 526 # allocate the appropriate children of edges | |
| 527 for terrainelement in terrain: | |
| 528 pheromone, pathlength, children = terrain[terrainelement] | |
| 529 pos, nucs = terrainelement.split(".") | |
| 530 if int(pos) < len(s): | |
| 531 to_nt = nucs[-1:] | |
| 532 successor_pos = int(pos) + 1 | |
| 533 for i in ["A", "C", "G", "U"]: | |
| 534 if str(successor_pos) + "." + to_nt + i in terrain: | |
| 535 children.append(str(successor_pos) + "." + to_nt + i) | |
| 536 terrain[terrainelement] = (pheromone, pathlength,children) | |
| 537 starts = [] | |
| 538 for i in ["A", "C", "G", "U"]: | |
| 539 if str(0) + "." + i in terrain: | |
| 540 starts.append(str(0) + "." + i) | |
| 541 terrain["00.XY"] = (1, 1, starts) | |
| 542 return (terrain, BPstack) | |
| 543 | |
| 544 | |
| 545 def initTerrain(s): # THE CLASSIC | |
| 546 """ | |
| 547 Initialization of the terrain with graph like terrain... vertices are modeled implicitly | |
| 548 """ | |
| 549 nt = ["A","C","G","U"] | |
| 550 nt2 = ["AA","AC","AG","AU","CA","CC","CG","CU","GA","GC","GG","GU","UA","UC","UG","UU"] # Allowed dinucleotides | |
| 551 e = {} | |
| 552 pathlength = 1 | |
| 553 pheromone = 1 | |
| 554 for p in xrange(len(s)): | |
| 555 if p == 0: | |
| 556 for i in nt: | |
| 557 e["%s.%s"%(p,i)] = (pheromone, pathlength) | |
| 558 elif p > 0: | |
| 559 for n in nt2: | |
| 560 e["%s.%s"%(p,n)] = (pheromone, pathlength) | |
| 561 return e | |
| 562 | |
| 563 | |
| 564 | |
| 565 def complementBase(c): | |
| 566 """ | |
| 567 Returns the complement RNA character of c (without GU base pairs) | |
| 568 """ | |
| 569 retChar = "" | |
| 570 if c == "A" : | |
| 571 retChar = "U" | |
| 572 elif c == "U": | |
| 573 retChar = "A" | |
| 574 elif c == "C": | |
| 575 retChar = "G" | |
| 576 elif c == "G": | |
| 577 retChar = "C" | |
| 578 return retChar | |
| 579 | |
| 580 def printTerrain(terrain): | |
| 581 #print sorted(terrain.keys()) | |
| 582 tmp_i = "0" | |
| 583 tmp_c = 0 | |
| 584 terrain = terrain[0] | |
| 585 | |
| 586 for a, i in enumerate(sorted(terrain.keys())): | |
| 587 #print a | |
| 588 if i.split(".")[0] != tmp_i: | |
| 589 print "\nElements:", tmp_c,"\n#########################\n", i, terrain[i] | |
| 590 | |
| 591 tmp_c = 1 | |
| 592 tmp_i = i.split(".")[0] | |
| 593 else: | |
| 594 print i, terrain[i] | |
| 595 tmp_c += 1 | |
| 596 | |
| 597 print "\nElements:", tmp_c | |
| 598 print "#########################" | |
| 599 print len(terrain) | |
| 600 | |
| 601 def pickStep(tmp_steps, summe): | |
| 602 """ | |
| 603 Selects a step within the terrain | |
| 604 """ | |
| 605 if len(tmp_steps) == 1: | |
| 606 return tmp_steps[0][1] # returning the nucleotide of the only present step | |
| 607 else: | |
| 608 rand = random.random() # draw random number | |
| 609 mainval = 0 | |
| 610 for choice in xrange(len(tmp_steps)): | |
| 611 val, label = tmp_steps[choice] | |
| 612 mainval += val/float(summe) | |
| 613 if mainval > rand: # as soon, as the mainval gets larger than the random value the assignment is done | |
| 614 return label | |
| 615 | |
| 616 def getPath(s, tmp_terrain, tmp_BPstack, alpha, beta, IUPAC, IUPAC_reverseComplements): | |
| 617 """ | |
| 618 Performs a walk through the terrain and assembles a sequence, while respecting the structure constraint and IUPAC base complementarity | |
| 619 of the base pairs GU, GC and AT | |
| 620 """ | |
| 621 nt = ["A","C","G","U"] | |
| 622 prev_edge = "00.XY" | |
| 623 sequence = "" | |
| 624 while len(sequence) < len(s): | |
| 625 coming_from = sequence[-1:] | |
| 626 summe = 0 | |
| 627 steps = [] | |
| 628 i = len(sequence) | |
| 629 allowed_nt = "ACGU" | |
| 630 # base pair closing case check, with subsequent delivery of a reduced allowed nt set | |
| 631 | |
| 632 if i > tmp_BPstack[i][1][0]: | |
| 633 jump = tmp_BPstack[i][1][0] | |
| 634 nuc_at_jump = sequence[jump] | |
| 635 allowed_nt = IUPAC_reverseComplements[nuc_at_jump] | |
| 636 | |
| 637 #allowed_nt = complementBase(nuc_at_jump) | |
| 638 | |
| 639 # Checking for every possible nt if it is suitable for the selection procedure | |
| 640 for edge in tmp_terrain[prev_edge][-1]: | |
| 641 | |
| 642 if edge[-1:] in allowed_nt: | |
| 643 pheromone, PL , children = tmp_terrain[edge] | |
| 644 #if PL > 0: | |
| 645 value = ((float(pheromone * alpha)) + ((1/float(PL)) * beta)) | |
| 646 summe += value | |
| 647 steps.append((value, edge)) | |
| 648 prev_edge = pickStep(steps, summe) | |
| 649 sequence += prev_edge[-1:] | |
| 650 | |
| 651 return sequence | |
| 652 | |
| 653 | |
| 654 ### | |
| 655 # STRUCTURE PREDICTORS | |
| 656 ### | |
| 657 def getPKStructure(sequence, temperature, mode = "A"): | |
| 658 """ | |
| 659 Initialization pKiss mfe pseudoknot prediction | |
| 660 """ | |
| 661 p2p = "pKiss" | |
| 662 #p2p = "/usr/local/pkiss/2014-03-17/bin/pKiss_mfe" | |
| 663 strategy = "--strategy " | |
| 664 t = "--temperature " + str(temperature) | |
| 665 | |
| 666 if mode == "A": strategy += "A" | |
| 667 elif mode == "B": strategy += "B" | |
| 668 elif mode == "C": strategy += "C" | |
| 669 elif mode == "D": strategy += "D" | |
| 670 elif mode == "P": strategy += "P" | |
| 671 | |
| 672 p = subprocess.Popen( ([p2p, "--mode mfe", strategy, t]), | |
| 673 #shell = True, | |
| 674 stdin = subprocess.PIPE, | |
| 675 stdout = subprocess.PIPE, | |
| 676 stderr = subprocess.PIPE, | |
| 677 close_fds = True) | |
| 678 #print p.stderr.readline() | |
| 679 | |
| 680 p.stdin.write(sequence+'\n') | |
| 681 pks = p.communicate() | |
| 682 structure = "".join(pks[0].split("\n")[2].split(" ")[-1:]) | |
| 683 return structure | |
| 684 | |
| 685 def init_RNAfold(version, temperature, paramFile = ""): | |
| 686 """ | |
| 687 Initialization RNAfold listener | |
| 688 """ | |
| 689 p2p = "" | |
| 690 t = "-T " + str(temperature) | |
| 691 P = "" | |
| 692 if paramFile != "": | |
| 693 P = "-P " + paramFile | |
| 694 if version == 185: | |
| 695 p2p = "/home/rk/Software/ViennaRNA/ViennaRNA-1.8.5/Progs/RNAfold" | |
| 696 p = subprocess.Popen( ([p2p, '--noPS', '-d 2', t, P]), | |
| 697 shell = True, | |
| 698 stdin = subprocess.PIPE, | |
| 699 stdout = subprocess.PIPE, | |
| 700 stderr = subprocess.PIPE, | |
| 701 close_fds = True) | |
| 702 return p | |
| 703 elif version == 213: | |
| 704 p2p = "RNAfold" | |
| 705 p = subprocess.Popen( ([p2p, '--noPS', '-d 2', t, P]), | |
| 706 #shell = True, | |
| 707 stdin = subprocess.PIPE, | |
| 708 stdout = subprocess.PIPE, | |
| 709 stderr = subprocess.PIPE, | |
| 710 close_fds = True) | |
| 711 return p | |
| 712 else: | |
| 713 exit(0) | |
| 714 | |
| 715 def consult_RNAfold(seq, p): | |
| 716 """ | |
| 717 Consults RNAfold listener | |
| 718 """ | |
| 719 p.stdin.write(seq+'\n') | |
| 720 out = "" | |
| 721 for i in xrange(2): | |
| 722 out += p.stdout.readline() | |
| 723 return out | |
| 724 | |
| 725 | |
| 726 def getRNAfoldStructure(struct2, process1): | |
| 727 """ | |
| 728 Retrieves folded structure of a RNAfold call | |
| 729 """ | |
| 730 | |
| 731 RNAfold_pattern = re.compile('.+\n([.()]+)\s.+') | |
| 732 #RNAdist_pattern = re.compile('.*\s([\d]+)') | |
| 733 RNAfold_match = RNAfold_pattern.match(consult_RNAfold(struct2, process1)) | |
| 734 current_structure = "" | |
| 735 #if RNAfold_match: | |
| 736 return RNAfold_match.group(1) | |
| 737 | |
| 738 | |
| 739 def init_RNAdistance(): | |
| 740 """ | |
| 741 Initialization of RNAdistance listener | |
| 742 """ | |
| 743 #p2p = "/home/rk/Software/ViennaRNA/ViennaRNA-1.8.5/Progs/RNAdistance" | |
| 744 p2p = "RNAdistance" | |
| 745 p = subprocess.Popen( ([p2p]), | |
| 746 #shell = True, | |
| 747 stdin = subprocess.PIPE, | |
| 748 stdout = subprocess.PIPE, | |
| 749 stderr = subprocess.PIPE, | |
| 750 close_fds = True) | |
| 751 return p | |
| 752 | |
| 753 | |
| 754 def consult_RNAdistance(s1, s2, p): | |
| 755 """ | |
| 756 Consulting the RNAdistance listener | |
| 757 """ | |
| 758 p.stdin.write(s1+'\n') | |
| 759 p.stdin.write(s2+'\n') | |
| 760 out = "" | |
| 761 out_tmp = p.stdout.readline().strip() | |
| 762 if out_tmp != "": | |
| 763 out += out_tmp | |
| 764 return out | |
| 765 | |
| 766 def getInducingSequencePositions(Cseq, degreeOfSequenceInducement): | |
| 767 """ | |
| 768 Delimiting the degree of structure inducement by the supplied sequence constraint. | |
| 769 0 : no sequence induced structure constraint | |
| 770 1 : "ACGT" induce structure (explicit nucleotide structure inducement level) | |
| 771 2 : "MWKSYR" and "ACGT" (explicit and double instances) | |
| 772 3 : "BDHV" , "MWKSYR" and "ACGT" (explicit, double, and triple instances) | |
| 773 """ | |
| 774 setOfNucleotides = "" # resembling the "0"-case | |
| 775 if degreeOfSequenceInducement == 1: | |
| 776 setOfNucleotides = "ACGU" | |
| 777 elif degreeOfSequenceInducement == 2: | |
| 778 setOfNucleotides = "ACGUMWKSYR" | |
| 779 elif degreeOfSequenceInducement == 3: | |
| 780 setOfNucleotides = "ACGUMWKSYRBDHV" | |
| 781 #elif degreeOfSequenceInducement == 4: | |
| 782 #setOfNucleotides = "ACGTMWKSYRBDHVN" | |
| 783 | |
| 784 tmpSeq = "" | |
| 785 listset = setOfNucleotides | |
| 786 for pos in Cseq: | |
| 787 if pos not in listset: | |
| 788 tmpSeq += "N" | |
| 789 else: | |
| 790 tmpSeq += pos | |
| 791 | |
| 792 return setOfNucleotides, tmpSeq | |
| 793 | |
| 794 | |
| 795 def getBPDifferenceDistance(stack1, stack2): | |
| 796 """ | |
| 797 Based on the not identical amount of base pairs within both structure stacks | |
| 798 """ | |
| 799 d = 0 | |
| 800 for i in stack1.keys(): | |
| 801 # check base pairs in stack 1 | |
| 802 if i < stack1[i] and stack1[i] != stack2[i]: | |
| 803 d += 1 | |
| 804 # check base pairs in stack 2 | |
| 805 for i in stack2.keys(): | |
| 806 if i < stack2[i] and stack1[i] != stack2[i]: | |
| 807 d += 1 | |
| 808 return d | |
| 809 | |
| 810 | |
| 811 def getStructuralDistance(target_structure, Cseq, path, RNAfold, verbose, LP, BP, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy): | |
| 812 """ | |
| 813 Calculator for Structural Distance | |
| 814 """ | |
| 815 # fold the current solution's sequence to obtain the structure | |
| 816 | |
| 817 current_structure = "" | |
| 818 | |
| 819 if pseudoknots: | |
| 820 current_structure = getPKStructure(path,strategy) | |
| 821 else: | |
| 822 RNAfold_match = RNAfold_pattern.match(consult_RNAfold(path, RNAfold)) | |
| 823 current_structure = RNAfold_match.group(1) | |
| 824 | |
| 825 # generate the current structure's base-pair stack | |
| 826 bp = getbpStack(current_structure)[0] | |
| 827 # add case-dependend structural constraints in case of lonley basepairs formation | |
| 828 tmp_target_structure_bp = getbpStack(target_structure)[0] | |
| 829 | |
| 830 for lp in LP: | |
| 831 if bp[lp] == LP[lp]: # if the base pair is within the current solution structure, re-add the basepair into the constraint structure. | |
| 832 #tmp_target_structure[lp] = "(" | |
| 833 #tmp_target_structure[LP[lp]] = ")" | |
| 834 tmp_target_structure_bp[lp] = LP[lp] | |
| 835 tmp_target_structure_bp[LP[lp]] = lp | |
| 836 | |
| 837 # REMOVE BLOCK CONSTRAINT AND SUBSTITUTE IT WITH SINGLE STRAND INFORMATION repsective with brackets, if allowed base pairs occure | |
| 838 # check for all allowed implicit constraint block declarators | |
| 839 for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
| 840 occurances = [] | |
| 841 for m in re.finditer(c, target_structure): # search for a declarator in the requested structure | |
| 842 occurances.append(m.start()) # save the corresponding index | |
| 843 | |
| 844 # transform declarator into single stranded request | |
| 845 for i in occurances: | |
| 846 #tmp_target_structure[i] = "." | |
| 847 tmp_target_structure_bp[i] = i | |
| 848 # infer a base pair within the block declarated positions, if the current structure provides it. | |
| 849 for i in occurances: | |
| 850 for j in occurances: | |
| 851 if i < j: | |
| 852 if bp[i] == j: | |
| 853 #tmp_target_structure[i] = "(" | |
| 854 #tmp_target_structure[bp[i]] = ")" | |
| 855 | |
| 856 tmp_target_structure_bp[i] = bp[i] | |
| 857 tmp_target_structure_bp[bp[i]] = i | |
| 858 | |
| 859 # CHECK FOR SEQUENCE CONSTRAINT WHICH INDUCES STRUCTURE CONSTRAINT IN THE MOMENTARY SITUATION | |
| 860 #print "Checking Cseq influence and it's induced basepairs..." | |
| 861 IUPACinducers, tmp_Cseq = getInducingSequencePositions(Cseq, degreeOfSequenceInducement) | |
| 862 if len(Cseq.strip("N")) > 0: | |
| 863 #print "Processing Cseq influence" | |
| 864 # Iterate over all positions within the Base Pair stack | |
| 865 for i in BP: # Check for each base index i | |
| 866 | |
| 867 if i < bp[i]: # if the current index is samller that the affiliated in the basepair stack of the current solution | |
| 868 | |
| 869 bp_j = bp[i] # Actual j index of the current solution | |
| 870 BP_j = BP[i][1][0] # j index of the requested structure | |
| 871 if (i != bp_j and i == BP_j and BP[i][0] in IUPACinducers ): # if i pairs with some other base in the current structure, and i is requested single stranded and the Sequence constraint is allowed to induce... | |
| 872 if (BP[bp_j][1][0] == bp_j and BP[bp_j][0] in IUPACinducers):# If position j is requested singlestranded and position j nucleotide can induce base pairs | |
| 873 #if isCompatible(bp[i][0], bp[i][1][1], IUPAC_compatibles): # If both nucleotides, i and j are actually compatible | |
| 874 #tmp_target_structure[i] = "(" | |
| 875 #tmp_target_structure[bp_j] = ")" | |
| 876 | |
| 877 tmp_target_structure_bp[i] = bp[i] | |
| 878 tmp_target_structure_bp[bp_j] = i | |
| 879 | |
| 880 #tts = "".join(tmp_target_structure) | |
| 881 dsreg = getBPDifferenceDistance(tmp_target_structure_bp, bp) | |
| 882 | |
| 883 # CHECK FOR ALL DETERMINED LONELY BASE PAIRS (i<j), if they are formed | |
| 884 failLP = 0 | |
| 885 for lp in LP: | |
| 886 | |
| 887 if bp[lp] != LP[lp]: | |
| 888 | |
| 889 isComp = isCompatible(path[lp],path[LP[lp]], IUPAC_compatibles) | |
| 890 isStru = isStructureCompatible(lp, LP[lp] ,bp) | |
| 891 if not ( isStru and isStru ): # check if the bases at the specific positions are compatible and check if the | |
| 892 # basepair can be formed according to pseudoknot free restriction. If one fails, a penalty distance is raised for that base pair | |
| 893 failLP += 1 | |
| 894 | |
| 895 #print dsreg, failLP, float(len(tmp_target_structure_bp)) | |
| 896 dsLP = float(failLP) | |
| 897 | |
| 898 return (dsreg + dsLP) /float(len(tmp_target_structure_bp)) | |
| 899 | |
| 900 | |
| 901 def getGC(sequence): | |
| 902 """ | |
| 903 Calculate GC content of a sequence | |
| 904 """ | |
| 905 GC = 0 | |
| 906 for nt in sequence: | |
| 907 if nt == "G" or nt == "C": | |
| 908 GC = GC + 1 | |
| 909 GC = GC/float(len(sequence)) | |
| 910 return GC | |
| 911 | |
| 912 | |
| 913 def getGCDistance(tGC, gc2, L): | |
| 914 """ | |
| 915 Calculate the pseudo GC content distance | |
| 916 """ | |
| 917 nt_coeff = L * tGC | |
| 918 pc_nt = (1/float(L))*100 | |
| 919 # | |
| 920 d = gc2 - tGC | |
| 921 d = d * 100 | |
| 922 | |
| 923 f = math.floor(nt_coeff) | |
| 924 c = math.ceil(nt_coeff) | |
| 925 | |
| 926 if d < 0: # | |
| 927 #print "case x",(abs(nt_coeff - f)), pc_nt, (abs(nt_coeff - f)) * pc_nt, | |
| 928 d = d + (abs(nt_coeff - f)) * pc_nt | |
| 929 elif d > 0: # case y | |
| 930 #print "case y", abs(nt_coeff - c), pc_nt, abs(nt_coeff - c) * pc_nt, | |
| 931 d = d - abs(nt_coeff - c) * pc_nt | |
| 932 elif d == 0: | |
| 933 pass | |
| 934 | |
| 935 d = round(d, 7) | |
| 936 #d = max(0, abs(d)- ( max ( abs( math.ceil(nt_coeff)-(nt_coeff)) , abs(math.floor(nt_coeff)-(nt_coeff)) )/L)*100 ) | |
| 937 return abs(d) | |
| 938 | |
| 939 | |
| 940 def getSequenceEditDistance(SC, path): | |
| 941 """ | |
| 942 Calculate sequence edit distance of a solution to the constraint | |
| 943 """# | |
| 944 IUPAC = {"A":"A", "C":"C", "G":"G", "U":"U", "R":"AG", "Y":"CU", "S":"GC", "W":"AU","K":"GU", "M":"AC", "B":"CGU", "D":"AGU", "H":"ACU", "V":"ACG", "N":"ACGU"} | |
| 945 edit = 0 | |
| 946 for i in xrange(len(SC)): | |
| 947 if path[i] not in IUPAC[SC[i]]: | |
| 948 edit += 1 | |
| 949 return edit/float(len(path)) | |
| 950 | |
| 951 | |
| 952 | |
| 953 def getTransitions(p): | |
| 954 """ | |
| 955 Retreive transitions of a specific path/sequence | |
| 956 """ | |
| 957 transitions = [] | |
| 958 for pos in xrange(len(p)): | |
| 959 if pos == 0: | |
| 960 transitions.append(str(pos) + "." + p[pos]) | |
| 961 | |
| 962 else: | |
| 963 insert = p[pos-1] + p[pos] | |
| 964 transitions.append(str(pos) + "." + insert) | |
| 965 | |
| 966 return transitions | |
| 967 | |
| 968 | |
| 969 def evaporate(t, er): | |
| 970 """ | |
| 971 Evaporate the terrain's pheromone trails | |
| 972 """ | |
| 973 terr, BP = t | |
| 974 c = 1 | |
| 975 for key in terr: | |
| 976 p,l,c = terr[key] | |
| 977 p *= (1-er) | |
| 978 terr[key] = (p, l, c) | |
| 979 | |
| 980 | |
| 981 def updateValue(distance, correction_term, omega): | |
| 982 """ | |
| 983 Retrieves a distance dependend pheromone value | |
| 984 """ | |
| 985 if correction_term == 0: | |
| 986 return 0 | |
| 987 else: | |
| 988 if distance == 0: | |
| 989 return omega * correction_term | |
| 990 else: | |
| 991 return (1/float(distance)) * correction_term | |
| 992 | |
| 993 | |
| 994 def trailBlaze(p, c_s, s, ds, dgc, dseq, dn, t, correction_terms, BPstack, verbose): | |
| 995 """ | |
| 996 Pheromone Update function accorinding to the quality of the solution | |
| 997 """ | |
| 998 terr, BP = t | |
| 999 bpstack, LP = getbpStack(c_s) | |
| 1000 | |
| 1001 struct_correction_term , GC_correction_term, seq_correction_term = correction_terms | |
| 1002 omega = 2.23 | |
| 1003 | |
| 1004 bs = updateValue(ds, struct_correction_term, omega) | |
| 1005 bGC = updateValue(dgc, GC_correction_term, omega) | |
| 1006 if dseq != "n.a.": | |
| 1007 bSeq = updateValue(dseq, seq_correction_term, omega) | |
| 1008 d = bs + bGC + bSeq | |
| 1009 else: | |
| 1010 d = bs + bGC | |
| 1011 transitions = getTransitions(p) | |
| 1012 | |
| 1013 for trans in xrange(len(transitions)): # for each transition in the path | |
| 1014 id1 = int(transitions[trans].split(".")[0]) | |
| 1015 tar_id2 = int(BPstack[id1][1][0]) # getting requested position id2 | |
| 1016 curr_id2 = int(bpstack[id1]) # getting the current situation | |
| 1017 multiplicator = 0 | |
| 1018 if tar_id2 == curr_id2 and id1 != tar_id2 and id1 != curr_id2: # case of a base pair, having both brackets on the correct position | |
| 1019 multiplicator = 1 | |
| 1020 elif tar_id2 == curr_id2 and id1 == tar_id2 and id1 == curr_id2: # case of a single stranded base in both structures | |
| 1021 multiplicator = 1 | |
| 1022 p, l, c = terr[transitions[trans]] # getting the pheromone and the length value of the single path transition | |
| 1023 p += d * multiplicator | |
| 1024 terr[transitions[trans]] = (p, l, c) # updating the values wihtin the terrain's | |
| 1025 t = (terr, BP) | |
| 1026 | |
| 1027 | |
| 1028 def updateTerrain(p, c_s, s, ds, dgc, dseq, dn, t, er, correction_terms, BPstack, verbose, ant_count): | |
| 1029 """ | |
| 1030 General updating function | |
| 1031 """ | |
| 1032 evaporate(t,er) | |
| 1033 trailBlaze(p, c_s, s, ds, dgc, dseq, dn, t, correction_terms, BPstack, verbose) | |
| 1034 | |
| 1035 | |
| 1036 def getUsedTime(start_time): | |
| 1037 """ | |
| 1038 Return the used time between -start time- and now. | |
| 1039 """ | |
| 1040 end_time = time.time() | |
| 1041 return end_time - start_time | |
| 1042 | |
| 1043 | |
| 1044 def good2Go(SC, L, CC, STR): | |
| 1045 """ | |
| 1046 Check, if all input is correct and runnable | |
| 1047 """ | |
| 1048 if (SC == 1 and L == 1 and CC == 1 and STR == 1): | |
| 1049 return True | |
| 1050 else: | |
| 1051 print SC,L,CC,STR | |
| 1052 return False | |
| 1053 | |
| 1054 | |
| 1055 def getPathFromSelection( aps, s, terrain, alpha, beta, RNAfold, RNAfold_pattern, GC, SC, LP, verbose, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, IUPAC, pseudoknots, strategy): | |
| 1056 """ | |
| 1057 Returns the winning path from a selection of pathes... | |
| 1058 """ | |
| 1059 terr, BPs = terrain | |
| 1060 win_path = 0 | |
| 1061 for i in xrange(aps): | |
| 1062 # Generate Sequence | |
| 1063 path = getPath(s, terr, BPs, alpha, beta, IUPAC, IUPAC_reverseComplements) | |
| 1064 # Measure sequence features and transform them into singular distances | |
| 1065 distance_structural = float(getStructuralDistance(s, SC , path, RNAfold, verbose, LP, BPs, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy)) | |
| 1066 distance_GC = float(getGCDistance(GC,getGC(path), len(path))) | |
| 1067 distance_seq = float(getSequenceEditDistance(SC, path)) | |
| 1068 # Calculate Distance Score | |
| 1069 D = distance_structural + distance_GC + distance_seq | |
| 1070 | |
| 1071 # SELECT THE BEST-OUT-OF-k-SOLUTIONS according to distance score | |
| 1072 if i == 0: | |
| 1073 win_path = (path, D, distance_structural, distance_GC, distance_seq) | |
| 1074 else: | |
| 1075 if D < win_path[1]: | |
| 1076 win_path = (path, D, distance_structural, distance_GC, distance_seq) | |
| 1077 return win_path | |
| 1078 | |
| 1079 | |
| 1080 def substr(x, string, subst): | |
| 1081 """ | |
| 1082 Classical substring function | |
| 1083 """ | |
| 1084 s1 = string[:x-1] | |
| 1085 | |
| 1086 s2 = string[x-1:x] | |
| 1087 s3 = string[x:] | |
| 1088 #s2 = s[x+len(string)-x-1:] | |
| 1089 | |
| 1090 return s1 + subst + s3 | |
| 1091 | |
| 1092 | |
| 1093 def inConvergenceCorridor(d_struct, d_gc, BS_d_struct, BS_d_gc): | |
| 1094 """ | |
| 1095 Check if a solutions qualities are within the convergence corridor | |
| 1096 """ | |
| 1097 struct_var = ((BS_d_struct/float(4)) + 3 ) * 4 | |
| 1098 gc_var = (BS_d_gc + 1/float(100) * 5) + BS_d_gc + 1 | |
| 1099 | |
| 1100 if d_struct <= struct_var and d_gc <= gc_var: | |
| 1101 return True | |
| 1102 else: | |
| 1103 return False | |
| 1104 | |
| 1105 def getGCSamplingValue(GC, tGCmax, tGCvar): | |
| 1106 """ | |
| 1107 Returns a suitable GC value, dependend on the user input: Either returning the single GC value, | |
| 1108 which the user entered, or a smpled GC value | |
| 1109 from a designated distribution in it's interavals | |
| 1110 """ | |
| 1111 returnval = 0 | |
| 1112 if tGCmax == -1.0 and tGCvar == -1.0: # regular plain tGC value as requested | |
| 1113 return GC | |
| 1114 elif tGCmax != -1.0 and tGCvar == -1.0: # uniform distribution tGC value sampling | |
| 1115 if GC < tGCmax: | |
| 1116 tmp_GC = tGCmax | |
| 1117 tGCmax = GC | |
| 1118 GC = tmp_GC | |
| 1119 while returnval <= 0: | |
| 1120 returnval = float(numpy.random.uniform(low=GC, high=tGCmax, size=1)) | |
| 1121 return returnval | |
| 1122 elif tGCmax == -1.0 and tGCvar != -1.0: # normal distribution tGC value sampling | |
| 1123 while returnval <= 0: | |
| 1124 returnval = float(numpy.random.normal(GC, tGCvar, 1)) | |
| 1125 return returnval | |
| 1126 | |
| 1127 | |
| 1128 def reachableGC(C_struct): | |
| 1129 """ | |
| 1130 Checks if a demanded GC target content is reachable in dependence with the given sequence constraint. | |
| 1131 """ | |
| 1132 AU = 0 | |
| 1133 for i in C_struct: | |
| 1134 if i == "A" or i == "U": | |
| 1135 AU += 1 | |
| 1136 maxGC = 1 - (AU / float(len(C_struct))) # 1 - min_GC | |
| 1137 return maxGC | |
| 1138 | |
| 1139 | |
| 1140 def runColony(s, SC, objective_to_target_distance, GC, alpha, beta, evaporation_rate, correction_terms, verbose, IUPAC, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, termination_convergence, convergence_count, reset_limit, improve, temperature, paramFile, pseudoknots, strategy): | |
| 1141 """ | |
| 1142 Execution function of a single ant colony finding one solution sequence | |
| 1143 """ | |
| 1144 retString = "" | |
| 1145 retString2 = [] | |
| 1146 BPstack, LP = getBPStack(s, SC) | |
| 1147 | |
| 1148 rGC = reachableGC(SC) | |
| 1149 GC_message = "" | |
| 1150 if GC > rGC: | |
| 1151 print >> sys.stderr, "WARNING: Chosen target GC %s content is not reachable due to sequence constraint! Sequence Constraint GC-content is: %s" % (GC, rGC) | |
| 1152 GC = rGC | |
| 1153 | |
| 1154 # Initial Constraint Checks prior to execution | |
| 1155 STR = isValidStructure(s) | |
| 1156 START_SC , SC = checkSequenceConstraint(str(SC)) | |
| 1157 START_LENGTH = checkSimilarLength(str(s), str(SC)) | |
| 1158 START_constraint_compatibility , CompReport = checkConstaintCompatibility(BPstack, SC, IUPAC_compatibles) | |
| 1159 | |
| 1160 g2g = good2Go(START_SC, START_LENGTH, START_constraint_compatibility, STR) | |
| 1161 if (g2g == 1): | |
| 1162 start_time = time.time() | |
| 1163 max_time = 600 # seconds | |
| 1164 | |
| 1165 | |
| 1166 | |
| 1167 | |
| 1168 #### | |
| 1169 # INITIALIZATION OF THE RNA TOOLs | |
| 1170 # | |
| 1171 RNAfold = init_RNAfold(213, temperature, paramFile) | |
| 1172 #RNAdistance = init_RNAdistance() | |
| 1173 RNAfold_pattern = re.compile('.+\n([.()]+)\s.+') | |
| 1174 #RNAdist_pattern = re.compile('.*\s([\d]+)') | |
| 1175 # | |
| 1176 #### | |
| 1177 | |
| 1178 terrain = initTerrain(s) | |
| 1179 #print len(terrain), | |
| 1180 terrain = applyTerrainModification(terrain, s, GC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements) | |
| 1181 #print len(terrain[0]) | |
| 1182 #printTerrain(terrain) | |
| 1183 #exit(0) | |
| 1184 global_ant_count = 0 | |
| 1185 global_best_ants = 0 | |
| 1186 criterion = False | |
| 1187 met = True | |
| 1188 ant_no = 1 | |
| 1189 prev_res = 0 | |
| 1190 seq = "" | |
| 1191 | |
| 1192 counter = 0 | |
| 1193 | |
| 1194 dstruct_log = [] | |
| 1195 dGC_log = [] | |
| 1196 | |
| 1197 | |
| 1198 distance_structural = 1000 | |
| 1199 distance_GC = 1000 | |
| 1200 distance_seq = 1000 | |
| 1201 | |
| 1202 convergence = convergence_count | |
| 1203 convergence_counter = 0 | |
| 1204 | |
| 1205 resets = 0 | |
| 1206 | |
| 1207 path = "" | |
| 1208 curr_structure = "" | |
| 1209 | |
| 1210 Dscore = 100000 | |
| 1211 distance_structural = 10000 | |
| 1212 distance_GC = 10000 | |
| 1213 distance_seq = 10000 | |
| 1214 best_solution = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq) | |
| 1215 best_solution_local = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq) | |
| 1216 | |
| 1217 best_solution_since = 0 | |
| 1218 | |
| 1219 ants_per_selection = 10 | |
| 1220 if len(LP) > 0 : | |
| 1221 for lp in LP: | |
| 1222 s = substr(lp + 1, s, ".") | |
| 1223 s = substr(LP[lp] + 1, s, ".") | |
| 1224 | |
| 1225 init = 1 | |
| 1226 while criterion != met and getUsedTime(start_time) < max_time: | |
| 1227 iteration_start = time.time() | |
| 1228 global_ant_count += 1 | |
| 1229 global_best_ants += 1 | |
| 1230 | |
| 1231 path_info = getPathFromSelection(ants_per_selection, s, terrain, alpha, beta, RNAfold, RNAfold_pattern, GC, SC, LP, verbose, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, IUPAC, pseudoknots, strategy) | |
| 1232 | |
| 1233 distance_structural_prev = distance_structural | |
| 1234 distance_GC_prev = distance_GC | |
| 1235 distance_seq_prev = distance_seq | |
| 1236 | |
| 1237 path, Dscore , distance_structural, distance_GC, distance_seq = path_info | |
| 1238 curr_structure = "" | |
| 1239 if pseudoknots: | |
| 1240 curr_structure = getPKStructure(path, strategy) | |
| 1241 else: | |
| 1242 curr_structure = getRNAfoldStructure(path, RNAfold) | |
| 1243 | |
| 1244 curr_solution = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq) | |
| 1245 # BEST SOLUTION PICKING | |
| 1246 if improve == "h": # hierarchical check | |
| 1247 # for the global best solution | |
| 1248 if distance_structural < best_solution[3] or (distance_structural == best_solution[3] and distance_GC < best_solution[4]): | |
| 1249 best_solution = curr_solution | |
| 1250 ant_no = 1 | |
| 1251 # for the local (reset) best solution | |
| 1252 if distance_structural < best_solution_local[3] or (distance_structural == best_solution_local[3] and distance_GC < best_solution_local[4]): | |
| 1253 best_solution_local = curr_solution | |
| 1254 | |
| 1255 elif improve == "s": #score based check | |
| 1256 # store best global solution | |
| 1257 if Dscore < best_solution[2]: | |
| 1258 best_solution = curr_solution | |
| 1259 ant_no = 1 | |
| 1260 # store best local solution for this reset | |
| 1261 if Dscore < best_solution_local[2]: | |
| 1262 best_solution_local = curr_solution | |
| 1263 | |
| 1264 # OLD ' BEST SOLUTION ' PICKING | |
| 1265 # if Dscore < best_solution[2]: | |
| 1266 # best_solution = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq) | |
| 1267 # | |
| 1268 # if Dscore < best_solution_local[2]: | |
| 1269 # best_solution_local = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq) | |
| 1270 | |
| 1271 | |
| 1272 distance_DN = 0 | |
| 1273 | |
| 1274 if verbose: | |
| 1275 print "SCORE " + str(Dscore) + " Resets " + str(resets) + " #Ant " + str(global_ant_count) + " out of " + str(ants_per_selection) + " cc " + str(convergence_counter) | |
| 1276 | |
| 1277 print s, " <- target struct" | |
| 1278 print best_solution[0] , " <- BS since ", str(best_solution_since), "Size of Terrrain:", len(terrain[0]) | |
| 1279 print best_solution[1] , " <- BS Dscore " + str(best_solution[2]) + " ds " + str(best_solution[3]) + " dGC " + str(best_solution[4]) + " dseq " + str(best_solution[5])+ " LP " + str(len(LP)) + " <- best solution stats" | |
| 1280 print curr_structure, " <- CS" | |
| 1281 print path, | |
| 1282 print " <- CS", "Dscore", str(Dscore), "ds", distance_structural, "dGC", distance_GC, "GC", getGC(path)*100, "Dseq", distance_seq | |
| 1283 | |
| 1284 #### UPDATING THE TERRAIN ACCORDING TO THE QUALITY OF THE CURRENT BESTO-OUT-OF-k SOLUTION | |
| 1285 updateTerrain(path, curr_structure, s, distance_structural,distance_GC, distance_seq, distance_DN, terrain, evaporation_rate, correction_terms, BPstack, verbose, global_ant_count) | |
| 1286 #### | |
| 1287 if verbose: print "Used time for one iteration", time.time() - iteration_start | |
| 1288 | |
| 1289 | |
| 1290 # CONVERGENCE AND TERMINATION CRITERION MANAGEMENT | |
| 1291 #print distance_structural, distance_GC, best_solution_local[3], best_solution_local[4] | |
| 1292 if inConvergenceCorridor(curr_solution[3], curr_solution[4], best_solution_local[3], best_solution_local[4]): | |
| 1293 convergence_counter += 1 | |
| 1294 if distance_structural_prev == distance_structural and distance_GC_prev == distance_GC: | |
| 1295 convergence_counter += 1 | |
| 1296 | |
| 1297 if best_solution[3] == objective_to_target_distance: | |
| 1298 if best_solution[4] == 0.0: | |
| 1299 break | |
| 1300 ant_no = ant_no + 1 | |
| 1301 convergence_counter -= 1 | |
| 1302 else: | |
| 1303 ant_no = 1 | |
| 1304 | |
| 1305 | |
| 1306 if ant_no == termination_convergence or resets >= reset_limit or global_ant_count >= 100000 or best_solution_since == 5: | |
| 1307 break | |
| 1308 | |
| 1309 # RESET | |
| 1310 if ant_no < termination_convergence and convergence_counter >= convergence: | |
| 1311 | |
| 1312 terrain = initTerrain(s) | |
| 1313 terrain = applyTerrainModification(terrain, s, GC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements) | |
| 1314 criterion = False | |
| 1315 met = True | |
| 1316 ant_no = 1 | |
| 1317 prev_res = 0 | |
| 1318 pre_path = "_" * len(s) | |
| 1319 path = "" | |
| 1320 curr_structure = "" | |
| 1321 counter = 0 | |
| 1322 Dscore = 100000 | |
| 1323 distance_structural = 1000 | |
| 1324 distance_GC = 1000 | |
| 1325 distance_seq = 1000 | |
| 1326 best_solution_local = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq) | |
| 1327 convergence = convergence_count | |
| 1328 convergence_counter = 0 | |
| 1329 | |
| 1330 if resets == 0: | |
| 1331 sentinel_solution = best_solution | |
| 1332 best_solution_since += 1 | |
| 1333 else: | |
| 1334 if best_solution[2] < sentinel_solution[2]: | |
| 1335 sentinel_solution = best_solution | |
| 1336 best_solution_since = 0 | |
| 1337 else: | |
| 1338 best_solution_since += 1 | |
| 1339 | |
| 1340 resets += 1 | |
| 1341 | |
| 1342 duration = getUsedTime(start_time) | |
| 1343 | |
| 1344 retString += "|Ants:" + str(global_ant_count) | |
| 1345 retString += "|Resets:" + str(resets) + "/" + str(reset_limit) | |
| 1346 retString += "|AntsTC:" + str(termination_convergence) | |
| 1347 retString += "|CC:" + str(convergence_count) | |
| 1348 retString += "|IP:" + str(improve) | |
| 1349 retString += "|BSS:" + str(best_solution_since) | |
| 1350 #if GC_message != "": | |
| 1351 # retString += GC_message + "\n" | |
| 1352 | |
| 1353 sequence = best_solution[0] | |
| 1354 struct = best_solution[1] | |
| 1355 | |
| 1356 retString += "|LP:" + str(len(LP)) | |
| 1357 retString += "|ds:" + str(getStructuralDistance(s,SC, sequence, RNAfold, verbose, LP, BPstack, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy)) | |
| 1358 retString += "|dGC:" + str(best_solution[4]) | |
| 1359 retString += "|GC:" + str(getGC(sequence)*100) | |
| 1360 retString += "|dseq:" + str(getSequenceEditDistance(SC, sequence)) | |
| 1361 retString += "|L:" + str(len(sequence)) | |
| 1362 retString += "|Time:" + str(duration) | |
| 1363 | |
| 1364 retString2.append(struct) | |
| 1365 retString2.append(sequence) | |
| 1366 | |
| 1367 # CLOSING THE PIPES TO THE PROGRAMS | |
| 1368 RNAfold.communicate() | |
| 1369 #RNAdistance.communicate() | |
| 1370 | |
| 1371 else: # Structural premisses are not met, htherefore the program will halt with a failure message | |
| 1372 retString += "\nSome mistake detected\n" | |
| 1373 retString += "SequenceConstraintCheck: " + str(START_SC) + "\nSequenceConstraint: " + str(SC) + "\nLengthCheck: " + str(START_LENGTH) + "\nConstraintCompatibility: " + str(START_constraint_compatibility)+ "\n" + CompReport + "\n" | |
| 1374 | |
| 1375 return (retString, retString2) | |
| 1376 | |
| 1377 def findSequence(structure, Cseq, tGC, colonies, name, alpha, beta, evaporation_rate, struct_correction_term, GC_correction_term, seq_correction_term, degreeOfSequenceInducement, file_id, verbose, output_verbose, tGCmax, tGCvar, termination_convergence, convergence_count, reset_limit, improve, seed, temperature, paramFile, pseudoknots, strategy, useGU, return_mod = False): | |
| 1378 """ | |
| 1379 MAIN antaRNA - ant assembled RNA | |
| 1380 """ | |
| 1381 | |
| 1382 if seed != "none": | |
| 1383 random.seed(seed) | |
| 1384 | |
| 1385 if Cseq == "": | |
| 1386 sequenceconstraint = "N" * len(structure) | |
| 1387 else: | |
| 1388 sequenceconstraint = str(Cseq) | |
| 1389 | |
| 1390 alpha = float(alpha) | |
| 1391 beta = float(beta) | |
| 1392 tGC = float(tGC) | |
| 1393 evaporation_rate = float(evaporation_rate) | |
| 1394 struct_correction_term = float(struct_correction_term) | |
| 1395 GC_correction_term = float(GC_correction_term) | |
| 1396 seq_correction_term = float(seq_correction_term) | |
| 1397 colonies = int(colonies) | |
| 1398 file_id = str(file_id) | |
| 1399 tmp_verbose = verbose | |
| 1400 tmp_output_verbose = output_verbose | |
| 1401 verbose = tmp_output_verbose # Due to later change, this is a twistaround and a switching of purpose | |
| 1402 output_verbose = tmp_verbose # Due to later change, this is a twistaround and a switching of purpose | |
| 1403 correction_terms = struct_correction_term, GC_correction_term, seq_correction_term | |
| 1404 temperature = float(temperature) | |
| 1405 print_to_STDOUT = (file_id == "STDOUT") | |
| 1406 | |
| 1407 useGU = useGU | |
| 1408 | |
| 1409 if return_mod == False: | |
| 1410 if print_to_STDOUT == False: | |
| 1411 outfolder = '/'.join(file_id.strip().split("/")[:-1]) | |
| 1412 curr_dir = os.getcwd() | |
| 1413 if not os.path.exists(outfolder): | |
| 1414 os.makedirs(outfolder) | |
| 1415 os.chdir(outfolder) | |
| 1416 | |
| 1417 | |
| 1418 sequenceconstraint = transform(sequenceconstraint) | |
| 1419 ############################################### | |
| 1420 | |
| 1421 # Allowed deviation from the structural target: | |
| 1422 objective_to_target_distance = 0.0 | |
| 1423 | |
| 1424 # Loading the IUPAC copatibilities of nuleotides and their abstract representing symbols | |
| 1425 IUPAC = {"A":"A", "C":"C", "G":"G", "U":"U", "R":"AG", "Y":"CU", "S":"GC", "W":"AU","K":"GU", "M":"AC", "B":"CGU", "D":"AGU", "H":"ACU", "V":"ACG", "N":"ACGU"} | |
| 1426 IUPAC_compatibles = loadIUPACcompatibilities(IUPAC, useGU) | |
| 1427 | |
| 1428 IUPAC_reverseComplements = {} | |
| 1429 if useGU == False: ## Without the GU basepair | |
| 1430 IUPAC_reverseComplements = {"A":"U", "C":"G", "G":"C", "U":"A", "R":"UC", "Y":"AG", "S":"GC", "W":"UA","K":"CA", "M":"UG", "B":"AGC", "D":"ACU", "H":"UGA", "V":"UGC", "N":"ACGU"} | |
| 1431 else: ## allowing the GU basepair | |
| 1432 IUPAC_reverseComplements = {"A":"U", "C":"G", "G":"UC", "U":"AG", "R":"UC", "Y":"AG", "S":"UGC", "W":"UAG","K":"UCAG", "M":"UG", "B":"AGCU", "D":"AGCU", "H":"UGA", "V":"UGC", "N":"ACGU"} | |
| 1433 | |
| 1434 result = [] | |
| 1435 for col in xrange(colonies): | |
| 1436 # Checking the kind of taget GC value should be used | |
| 1437 GC = getGCSamplingValue(tGC, tGCmax, tGCvar) | |
| 1438 | |
| 1439 # Actual execution of a ant colony procesdure | |
| 1440 output_v, output_w = runColony(structure, sequenceconstraint, objective_to_target_distance, GC, alpha, beta, evaporation_rate, correction_terms, verbose, IUPAC, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, termination_convergence, convergence_count, reset_limit, improve, temperature, paramFile, pseudoknots, strategy) | |
| 1441 | |
| 1442 # Post-Processing the output of a ant colony procedure | |
| 1443 line = ">" + name + str(col) | |
| 1444 if output_verbose: | |
| 1445 line += "|Cstr:" + structure + "|Cseq:" + sequenceconstraint + "|Alpha:" + str(alpha) + "|Beta:" + str(beta) + "|tGC:" + str(GC) + "|ER:" + str(evaporation_rate) + "|Struct_CT:" + str(struct_correction_term) + "|GC_CT:" + str(GC_correction_term) + "|Seq_CT:" + str(seq_correction_term) + output_v + "\n" + "\n".join(output_w) | |
| 1446 else: | |
| 1447 line += "\n" + output_w[1] | |
| 1448 if return_mod == False: | |
| 1449 if print_to_STDOUT: | |
| 1450 print line | |
| 1451 else: | |
| 1452 if col == 0: | |
| 1453 print2file(file_id, line, 'w') | |
| 1454 else: | |
| 1455 print2file(file_id, line, 'a') | |
| 1456 else: | |
| 1457 result.append(line) | |
| 1458 | |
| 1459 if return_mod == True: | |
| 1460 return result | |
| 1461 if print_to_STDOUT == False: | |
| 1462 os.chdir(curr_dir) | |
| 1463 | |
| 1464 def execute(args): | |
| 1465 """ | |
| 1466 CHECK AND PARSE THE COMMAND LINE STUFF | |
| 1467 """ | |
| 1468 | |
| 1469 # Checking the arguments, parsed from the shell | |
| 1470 ############################################### | |
| 1471 name = args.name | |
| 1472 structure = args.Cstr | |
| 1473 | |
| 1474 if args.Cseq == "": | |
| 1475 sequenceconstraint = "N" * len(structure) | |
| 1476 else: | |
| 1477 sequenceconstraint = args.Cseq | |
| 1478 | |
| 1479 seed = args.seed | |
| 1480 | |
| 1481 | |
| 1482 alpha = args.alpha | |
| 1483 beta = args.beta | |
| 1484 tGC = args.tGC | |
| 1485 if tGC < 0 or tGC > 1: | |
| 1486 print "Error: Chosen tGC not in range [0,1]" | |
| 1487 exit(1) | |
| 1488 evaporation_rate = args.ER | |
| 1489 struct_correction_term = args.Cstrweight | |
| 1490 GC_correction_term = args.Cgcweight | |
| 1491 seq_correction_term = args.Cseqweight | |
| 1492 colonies = args.noOfColonies | |
| 1493 degreeOfSequenceInducement = args.level | |
| 1494 file_id = args.output_file | |
| 1495 verbose = args.verbose | |
| 1496 output_verbose = args.output_verbose | |
| 1497 | |
| 1498 tGCmax = args.tGCmax | |
| 1499 tGCvar = args.tGCvar | |
| 1500 | |
| 1501 termination_convergence = args.antsTerConv | |
| 1502 convergence_count = args.ConvergenceCount | |
| 1503 temperature = args.temperature | |
| 1504 reset_limit = args.Resets | |
| 1505 | |
| 1506 improve = args.improve_procedure | |
| 1507 | |
| 1508 ### RNAfold parameterfile | |
| 1509 paramFile = args.paramFile | |
| 1510 | |
| 1511 # Using the pkiss program under user changeable parameters | |
| 1512 pseudoknots = args.pseudoknots | |
| 1513 | |
| 1514 # Loading the optimized parameters for pseudoknots and ignore user input | |
| 1515 if args.pseudoknot_parameters: | |
| 1516 alpha = 1.0 | |
| 1517 beta = 0.1 | |
| 1518 evaporation_rate = 0.2 | |
| 1519 struct_correction_term = 0.1 | |
| 1520 GC_correction_term = 1.0 | |
| 1521 seq_correction_term = 0.5 | |
| 1522 termination_convergence = 50 | |
| 1523 convergence_count = 100 | |
| 1524 | |
| 1525 | |
| 1526 strategy = args.strategy | |
| 1527 useGU = args.useGUBasePair | |
| 1528 | |
| 1529 checkForViennaTools() | |
| 1530 if pseudoknots: | |
| 1531 checkForpKiss() | |
| 1532 findSequence(structure, sequenceconstraint, tGC, colonies, name, alpha, beta, evaporation_rate, struct_correction_term, GC_correction_term, seq_correction_term, degreeOfSequenceInducement, file_id, verbose, output_verbose, tGCmax, tGCvar, termination_convergence, convergence_count, reset_limit, improve, seed, temperature, paramFile, pseudoknots, strategy, useGU) | |
| 1533 | |
| 1534 | |
| 1535 def exe(): | |
| 1536 """ | |
| 1537 MAIN EXECUTABLE WHICH PARSES THE INPUT LINE | |
| 1538 """ | |
| 1539 | |
| 1540 argument_parser = argparse.ArgumentParser( | |
| 1541 description = """ | |
| 1542 | |
| 1543 ######################################################################### | |
| 1544 # antaRNA - ant assembled RNA # | |
| 1545 # -> Ant Colony Optimized RNA Sequence Design # | |
| 1546 # ------------------------------------------------------------ # | |
| 1547 # Robert Kleinkauf (c) 2015 # | |
| 1548 # Bioinformatics, Albert-Ludwigs University Freiburg, Germany # | |
| 1549 ######################################################################### | |
| 1550 | |
| 1551 - For antaRNA only the VIENNNA RNA Package must be installed on your linux system. | |
| 1552 antaRNA will only check, if the executables of RNAfold and RNAdistance of the ViennaRNA package can be found. If those programs are | |
| 1553 not installed correctly, no output will be generated, an also no warning will be prompted. | |
| 1554 So the binary path of the Vienna Tools must be set up correctly in your system's PATH variable in order to run antaRNA correctly! | |
| 1555 | |
| 1556 - antaRNA was only tested under Linux. | |
| 1557 | |
| 1558 - For questions and remarks please feel free to contact us at http://www.bioinf.uni-freiburg.de/ | |
| 1559 | |
| 1560 """, | |
| 1561 | |
| 1562 epilog = """ | |
| 1563 Example calls: | |
| 1564 python antaRNA.py --Cstr "...(((...)))..." --tGC 0.5 -n 2 | |
| 1565 python antaRNA.py --Cstr ".........AAA(((...)))AAA........." --tGC 0.5 -n 10 --output_file /path/to/antaRNA_TESTRUN -ov | |
| 1566 python antaRNA.py --Cstr "BBBBB....AAA(((...)))AAA....BBBBB" --Cseq "NNNNANNNNNCNNNNNNNNNNNGNNNNNNUNNN" --tGC 0.5 -n 10 | |
| 1567 | |
| 1568 ######################################################################### | |
| 1569 # --- Hail to the Queen!!! All power to the swarm!!! --- # | |
| 1570 ######################################################################### | |
| 1571 """, | |
| 1572 #formatter_class=RawTextHelpFormatter | |
| 1573 ) | |
| 1574 | |
| 1575 # mandatorys | |
| 1576 argument_parser.add_argument("-Cstr", "--Cstr", help="Structure constraint using RNA dotbracket notation with fuzzy block constraint. \n(TYPE: %(type)s)\n\n", type=str, required=True) | |
| 1577 argument_parser.add_argument("-tGC", "--tGC", help="Objective target GC content in [0,1].\n(TYPE: %(type)s)\n\n", type=float, required=True) | |
| 1578 argument_parser.add_argument("-n", "--noOfColonies", help="Number of sequences which shall be produced. \n(TYPE: %(type)s)\n\n\n\n", type=int, default=1) | |
| 1579 argument_parser.add_argument("-GU", "--useGUBasePair", help="Allowing GU base pairs. \n\n", action="store_true") | |
| 1580 | |
| 1581 argument_parser.add_argument("-s", "--seed", help = "Provides a seed value for the used pseudo random number generator.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="none") | |
| 1582 argument_parser.add_argument("-ip", "--improve_procedure", help = "Select the improving method. h=hierarchical, s=score_based.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="s") | |
| 1583 argument_parser.add_argument("-r", "--Resets", help = "Amount of maximal terrain resets, until the best solution is retuned as solution.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=5) | |
| 1584 argument_parser.add_argument("-CC", "--ConvergenceCount", help = "Delimits the convergence count criterion for a reset.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=130) | |
| 1585 argument_parser.add_argument("-aTC", "--antsTerConv", help = "Delimits the amount of internal ants for termination convergence criterion for a reset.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=50) | |
| 1586 | |
| 1587 argument_parser.add_argument("-p", "--pseudoknots", help = "Switch to pseudoknot based prediction using pKiss. Check the pseudoknot parameter usage!!!\n\n", action="store_true") | |
| 1588 argument_parser.add_argument("-pkPar", "--pseudoknot_parameters", help = "Enable optimized parameters for the usage of pseudo knots (Further parameter input ignored).\n\n", action="store_true") | |
| 1589 argument_parser.add_argument("--strategy", help = "Defining the pKiss folding strategy.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="A") | |
| 1590 | |
| 1591 # Mutual Exclusiv target GC distribution variables | |
| 1592 #tGCgroup = argument_parser.add_mutually_exclusive_group() | |
| 1593 argument_parser.add_argument("-tGCmax", "--tGCmax", help = "Provides a maximum tGC value [0,1] for the case of uniform distribution sampling. The regular tGC value serves as minimum value.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=-1.0) | |
| 1594 argument_parser.add_argument("-tGCvar", "--tGCvar", help = "Provides a tGC variance (sigma square) for the case of normal distribution sampling. The regular tGC value serves as expectation value (mu).\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=-1.0) | |
| 1595 | |
| 1596 argument_parser.add_argument("-t", "--temperature", help = "Provides a temperature for the folding algorithms.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=37.0) | |
| 1597 argument_parser.add_argument("-P", "--paramFile", help = "Changes the energy parameterfile of RNAfold. If using this explicitly, please provide a suitable energy file delivered by RNAfold. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="") | |
| 1598 argument_parser.add_argument("-of","--output_file", help="Provide a path and an output file, e.g. \"/path/to/the/target_file\". \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="STDOUT") | |
| 1599 argument_parser.add_argument("-Cseq", "--Cseq", help="Sequence constraint using RNA nucleotide alphabet {A,C,G,U} and wild-card \"N\". \n(TYPE: %(type)s)\n\n", type=str, default = "") | |
| 1600 argument_parser.add_argument("-l", "--level", help="Sets the level of allowed influence of sequence constraint on the structure constraint [0:no influence; 3:extensive influence].\n(TYPE: %(type)s)\n\n", type=int, default = 1) | |
| 1601 argument_parser.add_argument("--name", help="Defines a name which is used in the sequence output. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="antaRNA_") | |
| 1602 argument_parser.add_argument("-a", "--alpha", help="Sets alpha, probability weight for terrain pheromone influence. [0,1] \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=1.0) | |
| 1603 argument_parser.add_argument("-b", "--beta", help="Sets beta, probability weight for terrain path influence. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=1.0) | |
| 1604 argument_parser.add_argument("-er", "--ER", help="Pheromone evaporation rate. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=0.2) | |
| 1605 argument_parser.add_argument("-Cstrw", "--Cstrweight", help="Structure constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=0.5) | |
| 1606 argument_parser.add_argument("-Cgcw", "--Cgcweight", help="GC content constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=5.0) | |
| 1607 argument_parser.add_argument("-Cseqw", "--Cseqweight", help="Sequence constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n\n", type=float, default=1.0) | |
| 1608 argument_parser.add_argument("-ov", "--output_verbose", help="Displayes intermediate output.\n\n", action="store_true") | |
| 1609 argument_parser.add_argument("-v", "--verbose", help="Prints additional features and stats to the headers of the produced sequences. Also adds the structure of the sequence.\n\n", action="store_true") | |
| 1610 | |
| 1611 args = argument_parser.parse_args() | |
| 1612 | |
| 1613 execute(args) | |
| 1614 | |
| 1615 def checkForViennaTools(): | |
| 1616 """ | |
| 1617 Checking for the presence of the Vienna tools in the system by which'ing for RNAfold and RNAdistance | |
| 1618 """ | |
| 1619 RNAfold_output = subprocess.Popen(["which", "RNAfold"], stdout=subprocess.PIPE).communicate()[0].strip() | |
| 1620 if len(RNAfold_output) > 0 and RNAfold_output.find("found") == -1 and RNAfold_output.find(" no ") == -1: | |
| 1621 return True | |
| 1622 else: | |
| 1623 print "It seems the Vienna RNA Package is not installed on your machine. Please do so!" | |
| 1624 print "You can get it at http://www.tbi.univie.ac.at/" | |
| 1625 exit(0) | |
| 1626 | |
| 1627 | |
| 1628 def checkForpKiss(): | |
| 1629 """ | |
| 1630 Checking for the presence of pKiss | |
| 1631 """ | |
| 1632 pKiss_output = subprocess.Popen(["which", "pKiss"], stdout=subprocess.PIPE).communicate()[0].strip() | |
| 1633 if len(pKiss_output) > 0 and pKiss_output.find("found") == -1 and pKiss_output.find(" no ") == -1: | |
| 1634 return True | |
| 1635 else: | |
| 1636 print "It seems that pKiss is not installed on your machine. Please do so!" | |
| 1637 print "You can get it at http://bibiserv2.cebitec.uni-bielefeld.de/pkiss" | |
| 1638 exit(0) | |
| 1639 | |
| 1640 | |
| 1641 | |
| 1642 if __name__ == "__main__": | |
| 1643 | |
| 1644 exe() | |
| 1645 |
