# HG changeset patch # User bornea # Date 1461958942 14400 # Node ID 378fc3676c788ba5872a7cc834adb6776e165126 # Parent a5444c834e72a502c79f307d509997a8721ff6ad Uploaded diff -r a5444c834e72 -r 378fc3676c78 CRAPomeQuery.py --- a/CRAPomeQuery.py Fri Apr 29 15:42:11 2016 -0400 +++ b/CRAPomeQuery.py Fri Apr 29 15:42:22 2016 -0400 @@ -5,6 +5,12 @@ @author: brentkuenzi """ ################################################################################ +## Dependencies ## +import urllib2 +import sys +import numpy +import os +################################################################################ # This program will read in a SAINT formatted 'prey.txt' file or a file # containing a single column list of uniprot accessions (e.g. "P00533" or # "EGFR_HUMAN")query the CRAPome database (v1.1), and return a file specifying @@ -18,24 +24,19 @@ # A copy of the license is included in the section entitled "GNU # Free Documentation License". ################################################################################ -## Dependencies ## -import urllib2 -import sys -import numpy -import os -################################################################################ ## REQUIRED INPUT ## # 1) crappyData: Prey.txt or single column list of Uniprot accessions crappyData = sys.argv[1] # Prey file or File with single column of accessions # 2) Species: HUMAN or YEAST species = sys.argv[2] # HUMAN or YEAST -db_path = sys.argv[4] +fasta_db = sys.argv[3] +db_path = sys.argv[5] ################################################################################ ## Global Variables ## if species == "HUMAN": - database = "Human_CRAPome_v1-1.txt" + database = str(db_path) + "Human_CRAPome_v1-1.txt" if species == "YEAST": - database = "Yeast_CRAPome_v1-1.txt" + database = str(db_path) + "Yeast_CRAPome_v1-1.txt" ################################################################################ ## CRAPomeQuery ## class ReturnValue1(object): @@ -46,48 +47,75 @@ def get_info(uniprot_accession_in): #get aa lengths and gene name error = open('error proteins.txt', 'a+') i=0 - while i==0: - try: - data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") - break - except urllib2.HTTPError, err: - i = i + 1 - if i == 50: - sys.exit("More than 50 errors. Check your file or try again later.") - if err.code == 404: - error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') - seqlength = 'NA' + if fasta_db == "None": + while i==0: + try: + data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") + break + except urllib2.HTTPError, err: + i = i + 1 + if i == 50: + sys.exit("More than 50 errors. Check your file or try again later.") + if err.code == 404: + error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') + seqlength = 'NA' + genename = 'NA' + return ReturnValue1(seqlength, genename) + elif err.code == 302: + sys.exit("Request timed out. Check connection and try again.") + else: + sys.exit("Uniprot had some other error") + lines = data.readlines() + header = lines[0] + lst = header.split('|') + lst2 = lst[2].split(' ') + swissprot = lst2[0] + uniprot_acc = lst[1] + if lines == []: + error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') + error.close + uniprot_acc = 'NA' + genename = 'NA' + return ReturnValue1(uniprot_acc, genename, swissprot) + if lines != []: + seqlength = 0 + header = lines[0] + if 'GN=' in header: + lst = header.split('GN=') + lst2 = lst[1].split(' ') + genename = lst2[0] + error.close + return ReturnValue1(uniprot_acc, genename, swissprot) + if 'GN=' not in header: genename = 'NA' - return ReturnValue1(seqlength, genename) - elif err.code == 302: - sys.exit("Request timed out. Check connection and try again.") - else: - sys.exit("Uniprot had some other error") - lines = data.readlines() - header = lines[0] - lst = header.split('|') - lst2 = lst[2].split(' ') - swissprot = lst2[0] - uniprot_acc = lst[1] - if lines == []: - error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') - error.close - uniprot_acc = 'NA' - genename = 'NA' - return ReturnValue1(uniprot_acc, genename, swissprot) - if lines != []: - seqlength = 0 - header = lines[0] - if 'GN=' in header: - lst = header.split('GN=') - lst2 = lst[1].split(' ') - genename = lst2[0] + error.close + return ReturnValue1(uniprot_acc, genename, swissprot) + else: + data = open(fasta_db, 'r') + data_lines = data.readlines() + db_len = len(data_lines) + for fasta_line in data_lines: + if uniprot_accession_in in fasta_line: + sp, uniprot_acc, name_info = fasta_line.split("|") + swissprot = name_info.split(' ')[0] + if 'GN=' in name_info: + lst = name_info.split('GN=') + lst2 = lst[1].split(' ') + genename = lst2[0] + return ReturnValue1(uniprot_acc, genename, swissprot) + if 'GN=' not in name_info: + genename = 'NA' + return ReturnValue1(uniprot_acc, genename, swissprot) + if sp = ">sp": + error.close() + else: + error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') error.close - return ReturnValue1(uniprot_acc, genename, swissprot) - if 'GN=' not in header: + uniprot_acc = 'NA' genename = 'NA' - error.close return ReturnValue1(uniprot_acc, genename, swissprot) + + def readtab(infile): # read in tab-delim text with open(infile,'r') as x: output = [] @@ -134,7 +162,7 @@ temp.append(i[0]) # append gene name ave = [] total = len(i[3:]) # calculate total experiments - for j in i[3:]: + for j in i[3:]: if j != '0': ave.append(int(j)) # calculate Ave.SC on only experiments with ID cnt+=1 @@ -148,6 +176,9 @@ else: temp.append(i[2]) # append accession temp.append(i[3]) + temp.append("NA") + temp.append("NA") + temp.append("NA") query.append(temp) # final query results header = ["User Input","Mapped Gene Symbol","Num of Expt. (found/total)","Ave SC","Max SC"] @@ -158,5 +189,5 @@ if __name__ == '__main__': crapome(crappyData) -os.rename("Crappy Data.txt", sys.argv[3]) +os.rename("Crappy Data.txt", sys.argv[4]) ## END ## \ No newline at end of file