comparison data_manager/resource_building.py @ 6:8f33a6e6e36c draft

"planemo upload commit 4747fc3ca8e24e0f6c0cfcde0992780c6d4ef4ff-dirty"
author proteore
date Wed, 10 Jun 2020 03:13:18 -0400
parents b05fa99ddda2
children b8565596bb25
comparison of 5:b05fa99ddda2 with 6:8f33a6e6e36c
@@ -130,11 +130,12 @@
     return False

 #######################################################################################################
 # 3. ID mapping file
 #######################################################################################################
 import ftplib, gzip
+from io import StringIO
 csv.field_size_limit(sys.maxsize) # to handle big files

 def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) :

     human = species == "Human"
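Note on the csv.field_size_limit(sys.maxsize) line kept in this hunk: on builds where sys.maxsize does not fit into a C long (notably 64-bit Windows), that call raises OverflowError. A minimal, hedged sketch of a more defensive way to raise the limit; the helper name raise_csv_field_limit is illustrative and not part of this changeset.

import csv, sys

def raise_csv_field_limit():
    # Grow the CSV field size limit as far as this platform accepts,
    # halving on OverflowError (csv.field_size_limit rejects values
    # larger than a C long on some builds).
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            return limit
        except OverflowError:
            limit = int(limit / 2)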
@@ -232,11 +233,11 @@
     #print ("tab ok")

     #add missing nextprot ID for human or replace old ones
     if human :
         #build next_dict
-        nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        nextprot_path = download_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
         with open(nextprot_path,'r') as nextprot_ids :
             nextprot_ids = nextprot_ids.read().splitlines()
         if os.path.exists(os.path.join(archive,nextprot_path.split("/")[-1])) : os.remove(os.path.join(archive,nextprot_path.split("/")[-1]))
         shutil.move(nextprot_path,archive)
         next_dict = {}
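The hunk above switches the human branch to the new download_from_nextprot_ftp helper, then moves the downloaded nextprot_ac_list_all.txt into the archive directory after deleting any copy left by a previous run. A short sketch of that archive step in isolation, assuming archive and the downloaded path are the same variables as in id_mapping_sources; the helper name is hypothetical.

import os, shutil

def archive_downloaded_file(downloaded_path, archive_dir):
    # Hypothetical helper mirroring the step above: remove a stale copy
    # before shutil.move, because moving onto an existing file raises
    # an error on some platforms (e.g. Windows) instead of overwriting.
    destination = os.path.join(archive_dir, os.path.basename(downloaded_path))
    if os.path.exists(destination):
        os.remove(destination)
    shutil.move(downloaded_path, archive_dir)
    return destination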
@@ -277,20 +278,31 @@
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
     return (path)

+def download_from_nextprot_ftp(file,target_directory) :
+    ftp_dir = "pub/current_release/ac_lists/"
+    path = os.path.join(target_directory, file)
+    ftp = ftplib.FTP("ftp.nextprot.org")
+    ftp.login("anonymous", "anonymous")
+    ftp.cwd(ftp_dir)
+    ftp.retrbinary("RETR " + file, open(path, 'wb').write)
+    ftp.quit()
+    return (path)
+
 def id_list_from_nextprot_ftp(file,target_directory) :
     ftp_dir = "pub/current_release/ac_lists/"
     path = os.path.join(target_directory, file)
     ftp = ftplib.FTP("ftp.nextprot.org")
     ftp.login("anonymous", "anonymous")
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-
-    return (path)
+    with open(path,'r') as nextprot_ids :
+        nextprot_ids = nextprot_ids.read().splitlines()
+    return (nextprot_ids)

 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
     if key1 in dico :
         if key2 in dico[key1] :
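Both download_from_nextprot_ftp and the reworked id_list_from_nextprot_ftp leave the FTP session and the local file handle open if retrbinary raises. A sketch of the same download using context managers, assuming Python 3 (ftplib.FTP as a context manager) and that ftp.nextprot.org still serves pub/current_release/ac_lists/ anonymously; the function name is illustrative only.

import ftplib, os

def fetch_nextprot_ac_list(file, target_directory):
    # Sketch: same retrieval as the two helpers above, but the FTP
    # connection and the output file are closed even if the transfer fails.
    path = os.path.join(target_directory, file)
    with ftplib.FTP("ftp.nextprot.org") as ftp:
        ftp.login("anonymous", "anonymous")
        ftp.cwd("pub/current_release/ac_lists/")
        with open(path, 'wb') as handle:
            ftp.retrbinary("RETR " + file, handle.write)
    with open(path, 'r') as handle:
        return handle.read().splitlines()

Collapsing the two helpers into one function like this would be a further refactor; the changeset keeps them separate so download_from_nextprot_ftp returns the downloaded file path while id_list_from_nextprot_ftp returns the ID list.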
@@ -547,13 +559,17 @@

     nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
     writer.writerows(nextprot_file)

     for id in ids :
-        #print (id)
         query="https://api.nextprot.org/entry/"+id+".json"
-        resp = requests.get(url=query)
+        try:
+            resp = requests.get(url=query)
+        except :
+            print ("waiting 1 hour before trying again")
+            time.sleep(3600)
+            resp = requests.get(url=query)
         data = resp.json()

         #get info from json dictionary
         mass_mol = data["entry"]["isoforms"][0]["massAsString"]
         seq_length = data['entry']["isoforms"][0]["sequenceLength"]
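The retry added around requests.get catches every exception (including KeyboardInterrupt) and retries only once. A hedged sketch of a narrower retry loop for the same neXtProt API call; the function name, attempt count and timeout are illustrative choices, not part of this commit.

import time
import requests

def get_nextprot_entry(id, retries=3, wait_seconds=3600):
    # Retry only on requests-level failures (connection errors, timeouts,
    # HTTP error status) and give up after a few attempts.
    query = "https://api.nextprot.org/entry/" + id + ".json"
    for attempt in range(retries):
        try:
            resp = requests.get(url=query, timeout=60)
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.RequestException:
            if attempt == retries - 1:
                raise
            print("waiting " + str(wait_seconds) + " seconds before trying " + query + " again")
            time.sleep(wait_seconds)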