changeset 11:b688d0dae86b draft

Uploaded
author bornea
date Tue, 12 Apr 2016 12:54:38 -0400
parents 1b0547d3c7bc
children 3e1b66d58f94
files SAINT_preprocessing.py
diffstat 1 files changed, 5 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/SAINT_preprocessing.py	Tue Mar 29 14:46:04 2016 -0400
+++ b/SAINT_preprocessing.py	Tue Apr 12 12:54:38 2016 -0400
@@ -25,6 +25,7 @@
 
 import sys
 import os.path
+import re
 
 
 infile = sys.argv[1] 
@@ -197,8 +198,10 @@
     for Scaffold_line in data:
         Scaffold_line[4] = Scaffold_line[4].split()[0]
         # Removes the (+##) that sometimes is attached.
-    for protein in data:
-        proteins.append(protein[prot_start])
+    uniprot_re = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
+    for protein in data: 
+        prot_id = uniprot_re.match(protein[prot_start])
+        proteins.append(prot_id.group())
     return ReturnValue2(data, proteins, header)