Mercurial > repos > bornea > query_crapome
comparison CRAPomeQuery.py @ 0:4d47d78b193a draft
Uploaded
| author | bornea |
|---|---|
| date | Mon, 18 Apr 2016 12:16:53 -0400 |
| parents | |
| children | d1a26feef9de |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4d47d78b193a |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 """ | |
| 3 Created on Thu Apr 14 16:58:05 2016 | |
| 4 | |
| 5 @author: brentkuenzi | |
| 6 """ | |
| 7 ################################################################################ | |
| 8 # This program will read in a SAINT formatted 'prey.txt' file or a file | |
| 9 # containing a single column list of uniprot accessions (e.g. "P00533" or | |
| 10 # "EGFR_HUMAN")query the CRAPome database (v1.1), and return a file specifying | |
| 11 # the prevalence of each protein in the CRAPome. | |
| 12 ################################################################################ | |
| 13 # Copyright (C) Brent Kuenzi. | |
| 14 # Permission is granted to copy, distribute and/or modify this document | |
| 15 # under the terms of the GNU Free Documentation License, Version 1.3 | |
| 16 # or any later version published by the Free Software Foundation; | |
| 17 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. | |
| 18 # A copy of the license is included in the section entitled "GNU | |
| 19 # Free Documentation License". | |
| 20 ################################################################################ | |
| 21 ## REQUIRED INPUT ## | |
| 22 # 1) crappyData: Prey.txt or single column list of Uniprot accessions | |
| 23 crappyData = sys.argv[1] # Prey file or File with single column of accessions | |
| 24 # 2) Species: HUMAN or YEAST | |
| 25 species = sys.argv[2] # HUMAN or YEAST | |
| 26 db_path = sys.argv[4] | |
| 27 ################################################################################ | |
| 28 ## Dependencies ## | |
| 29 import urllib2 | |
| 30 import sys | |
| 31 import numpy | |
| 32 import os | |
| 33 ################################################################################ | |
| 34 ## Global Variables ## | |
| 35 if species == "HUMAN": | |
| 36 database = "Human_CRAPome_v1-1.txt" | |
| 37 if species == "YEAST": | |
| 38 database = "Yeast_CRAPome_v1-1.txt" | |
| 39 ################################################################################ | |
| 40 ## CRAPomeQuery ## | |
| 41 class ReturnValue1(object): | |
| 42 def __init__(self, uniprot_acc, gene, swissprot): | |
| 43 self.up = uniprot_acc | |
| 44 self.gn = gene | |
| 45 self.sp = swissprot | |
| 46 def get_info(uniprot_accession_in): #get aa lengths and gene name | |
| 47 error = open('error proteins.txt', 'a+') | |
| 48 i=0 | |
| 49 while i==0: | |
| 50 try: | |
| 51 data = urllib2.urlopen("http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta") | |
| 52 break | |
| 53 except urllib2.HTTPError, err: | |
| 54 i = i + 1 | |
| 55 if i == 50: | |
| 56 sys.exit("More than 50 errors. Check your file or try again later.") | |
| 57 if err.code == 404: | |
| 58 error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n') | |
| 59 seqlength = 'NA' | |
| 60 genename = 'NA' | |
| 61 return ReturnValue1(seqlength, genename) | |
| 62 elif err.code == 302: | |
| 63 sys.exit("Request timed out. Check connection and try again.") | |
| 64 else: | |
| 65 sys.exit("Uniprot had some other error") | |
| 66 lines = data.readlines() | |
| 67 header = lines[0] | |
| 68 lst = header.split('|') | |
| 69 lst2 = lst[2].split(' ') | |
| 70 swissprot = lst2[0] | |
| 71 uniprot_acc = lst[1] | |
| 72 if lines == []: | |
| 73 error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n') | |
| 74 error.close | |
| 75 uniprot_acc = 'NA' | |
| 76 genename = 'NA' | |
| 77 return ReturnValue1(uniprot_acc, genename, swissprot) | |
| 78 if lines != []: | |
| 79 seqlength = 0 | |
| 80 header = lines[0] | |
| 81 if 'GN=' in header: | |
| 82 lst = header.split('GN=') | |
| 83 lst2 = lst[1].split(' ') | |
| 84 genename = lst2[0] | |
| 85 error.close | |
| 86 return ReturnValue1(uniprot_acc, genename, swissprot) | |
| 87 if 'GN=' not in header: | |
| 88 genename = 'NA' | |
| 89 error.close | |
| 90 return ReturnValue1(uniprot_acc, genename, swissprot) | |
| 91 def readtab(infile): # read in tab-delim text | |
| 92 with open(infile,'r') as x: | |
| 93 output = [] | |
| 94 for line in x: | |
| 95 line = line.strip() | |
| 96 temp = line.split('\t') | |
| 97 output.append(temp) | |
| 98 return output | |
| 99 def crapome(infile): # Query CRAPome | |
| 100 data = readtab(infile) | |
| 101 crapome = readtab(database) | |
| 102 filt = [] | |
| 103 for i in data: # Filter CRAPome database on our data | |
| 104 flag = 0 # is protein in CRAPome? | |
| 105 ac_flag = 0 # is it _SPECIES or not | |
| 106 unique = 0 # only take first ID in CRAPome | |
| 107 if "_"+species in i[0]: | |
| 108 ac = i[0] | |
| 109 else: | |
| 110 ac = get_info(i[0]).sp # query swissprot if not _SPECIES | |
| 111 ac_flag +=1 | |
| 112 for j in crapome: | |
| 113 if ac == j[2]: | |
| 114 if ac_flag == 0: # if _SPECIES | |
| 115 if unique == 0: | |
| 116 filt.append(j) | |
| 117 flag+=1 | |
| 118 unique+=1 | |
| 119 if ac_flag != 0: # if not _SPECIES | |
| 120 if unique == 0: | |
| 121 unique+=1 | |
| 122 j[2] = i[0] # change to user input | |
| 123 filt.append(j) | |
| 124 flag +=1 | |
| 125 if flag == 0: # if protein is not present in CRAPome database then add it | |
| 126 filt.append(["\t", "\t", i[0], "Invalid identifier / gene not available"]) | |
| 127 total = 0 # Experiment counter | |
| 128 query = [] | |
| 129 for i in filt: # Create CRAPome file as list | |
| 130 temp=[] | |
| 131 if len(i) > 5: | |
| 132 cnt=0 | |
| 133 temp.append(i[2]) # append accession | |
| 134 temp.append(i[0]) # append gene name | |
| 135 ave = [] | |
| 136 total = len(i[3:]) # calculate total experiments | |
| 137 for j in i[3:]: | |
| 138 if j != '0': | |
| 139 ave.append(int(j)) # calculate Ave.SC on only experiments with ID | |
| 140 cnt+=1 | |
| 141 temp.append(str(cnt) + " / "+str(total)) # format ratio | |
| 142 if ave != []: | |
| 143 temp.append(str(round(numpy.mean(ave),1))) # calculate Ave.SC | |
| 144 temp.append(str(max(ave))) # calculate Max.SC | |
| 145 else: | |
| 146 temp.append(0) # add 0 if has not been ID'd in CRAPome | |
| 147 temp.append(0) # add 0 if has not been ID'd in CRAPome | |
| 148 else: | |
| 149 temp.append(i[2]) # append accession | |
| 150 temp.append(i[3]) | |
| 151 query.append(temp) # final query results | |
| 152 | |
| 153 header = ["User Input","Mapped Gene Symbol","Num of Expt. (found/total)","Ave SC","Max SC"] | |
| 154 with open("Crappy Data.txt","wt") as x: # write file | |
| 155 x.write("\t".join(header) + "\n") | |
| 156 for i in query: | |
| 157 x.write("\t".join(i) + "\n") | |
| 158 if __name__ == '__main__': | |
| 159 crapome(crappyData) | |
| 160 | |
| 161 os.rename("Crappy Data.txt", sys.argv[3]) | |
| 162 ## END ## |
