# HG changeset patch # User bornea # Date 1460481876 14400 # Node ID 3e1b66d58f941d3247687bdf450a54b006925802 # Parent b688d0dae86b0cc140d4908a727017fd073a09bb Uploaded diff -r b688d0dae86b -r 3e1b66d58f94 SAINT_preprocessing.py --- a/SAINT_preprocessing.py Tue Apr 12 12:54:38 2016 -0400 +++ b/SAINT_preprocessing.py Tue Apr 12 13:24:36 2016 -0400 @@ -199,9 +199,22 @@ Scaffold_line[4] = Scaffold_line[4].split()[0] # Removes the (+##) that sometimes is attached. uniprot_re = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") - for protein in data: + for protein in data: prot_id = uniprot_re.match(protein[prot_start]) - proteins.append(prot_id.group()) + if prot_id: + proteins.append(prot_id.group()) + else: + prot_ids = protein[prot_start].split("|") + for prot_id in prot_ids: + if "_HUMAN" in prot_id: + proteins.append(prot_id) + elif "_YEAST" in prot_id: + proteins.append(prot_id) + elif "_MOUSE" in prot_id: + proteins.append(prot_id) + else: + print "Accession must be uniprot ID or gene name" + sys.exit() return ReturnValue2(data, proteins, header)