changeset 12:3e1b66d58f94 draft

Uploaded
author bornea
date Tue, 12 Apr 2016 13:24:36 -0400
parents b688d0dae86b
children febb6def95cb
files SAINT_preprocessing.py
diffstat 1 files changed, 15 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/SAINT_preprocessing.py	Tue Apr 12 12:54:38 2016 -0400
+++ b/SAINT_preprocessing.py	Tue Apr 12 13:24:36 2016 -0400
@@ -199,9 +199,22 @@
         Scaffold_line[4] = Scaffold_line[4].split()[0]
         # Removes the (+##) that sometimes is attached.
     uniprot_re = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
-    for protein in data: 
+    for protein in data:
         prot_id = uniprot_re.match(protein[prot_start])
-        proteins.append(prot_id.group())
+        if prot_id: 
+            proteins.append(prot_id.group())
+        else:
+            prot_ids = protein[prot_start].split("|")
+            for prot_id in prot_ids:
+                if "_HUMAN" in prot_id:
+                    proteins.append(prot_id)
+                elif "_YEAST" in prot_id:
+                    proteins.append(prot_id)
+                elif "_MOUSE" in prot_id:
+                    proteins.append(prot_id)
+                else: 
+                    print "Accession must be uniprot ID or gene name"
+                    sys.exit()
     return ReturnValue2(data, proteins, header)