Mercurial > repos > proteore > proteore_data_manager
comparison data_manager/resource_building.py @ 7:b8565596bb25 draft default tip
"planemo upload commit 7afd4b3ee25f024257ccbac6e51076d25b2a04e7"
author | proteore |
---|---|
date | Thu, 20 Aug 2020 03:33:35 -0400 |
parents | 8f33a6e6e36c |
children |
comparison
equal
deleted
inserted
replaced
6:8f33a6e6e36c | 7:b8565596bb25 |
---|---|
552 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" | 552 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" |
553 path = os.path.join(target_directory,output_file) | 553 path = os.path.join(target_directory,output_file) |
554 name = "neXtProt release "+time.strftime("%d-%m-%Y") | 554 name = "neXtProt release "+time.strftime("%d-%m-%Y") |
555 release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y") | 555 release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y") |
556 | 556 |
557 output = open(path, 'w') | 557 output = open('test.csv', 'w') |
558 writer = csv.writer(output,delimiter="\t") | 558 writer = csv.writer(output,delimiter="\t") |
559 | 559 |
560 nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]] | 560 nextprot_file=[["NextprotID","ProteinName","SeqLength","MW","IsoPoint","TMDomains","SubcellLocations","Diseases","Function","PostTranslationalModifications","ProteinFamily","Pathway","ProteinExistence","Chr"]] |
561 writer.writerows(nextprot_file) | 561 writer.writerows(nextprot_file) |
562 | 562 |
563 for id in ids : | 563 for id in ids : |
564 query="https://api.nextprot.org/entry/"+id+".json" | 564 query="https://api.nextprot.org/entry/"+id+".json" |
565 try: | 565 try: |
566 resp = requests.get(url=query) | 566 resp = requests.get(url=query) |
567 except : | 567 except : |
568 print ("wainting 1 hour before trying again") | 568 print ("waiting 15 minutes before trying again") |
569 time.sleep(3600) | 569 time.sleep(900) |
570 resp = requests.get(url=query) | 570 resp = requests.get(url=query) |
571 data = resp.json() | 571 data = resp.json() |
572 | 572 |
573 #get info from json dictionary | 573 #get info from json dictionary |
574 mass_mol = data["entry"]["isoforms"][0]["massAsString"] | 574 mass_mol = data["entry"]["isoforms"][0]["massAsString"] |
575 seq_length = data['entry']["isoforms"][0]["sequenceLength"] | 575 seq_length = data['entry']["isoforms"][0]["sequenceLength"] |
576 iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"] | 576 iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"] |
577 chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"] | 577 chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"] |
578 protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level']) | 578 protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level']) |
| | 579 protein_name = data['entry']["overview"]['proteinNames'][0]['name'] |
| | 580 |
| | 581 #get families description |
| | 582 if 'families' in data['entry']["overview"] and len(data['entry']["overview"]['families']) > 0: |
| | 583 families = data['entry']["overview"]['families'] |
| | 584 families = [entry['description'] for entry in families] |
| | 585 protein_family = ";".join(families) |
| | 586 else: |
| | 587 protein_family = 'NA' |
| | 588 |
| | 589 #get Protein function |
| | 590 if 'function-info' in data['entry']['annotationsByCategory'].keys(): |
| | 591 function_info = data['entry']['annotationsByCategory']['function-info'] |
| | 592 function_info = [entry['description'] for entry in function_info if entry['qualityQualifier'] == 'GOLD'] |
| | 593 function = ';'.join(function_info) |
| | 594 else : |
| | 595 function = 'NA' |
| | 596 |
| | 597 #Get ptm-info |
| | 598 post_trans_mod = 'NA' |
| | 599 if 'ptm-info' in data['entry']['annotationsByCategory'].keys(): |
| | 600 ptm_info = data['entry']['annotationsByCategory']['ptm-info'] |
| | 601 infos = [entry['description'] for entry in ptm_info if entry['qualityQualifier'] == 'GOLD'] |
| | 602 post_trans_mod = ";".join(infos) |
| | 603 |
| | 604 #Get pathway(s) |
| | 605 if 'pathway' in data['entry']['annotationsByCategory'].keys(): |
| | 606 pathways = data['entry']['annotationsByCategory']['pathway'] |
| | 607 pathways = [entry['description'] for entry in pathways if entry['qualityQualifier'] == 'GOLD'] |
| | 608 pathway = ";".join(pathways) |
| | 609 else : |
| | 610 pathway = 'NA' |
579 | 611 |
580 #put all subcell loc in a set | 612 #put all subcell loc in a set |
581 if "subcellular-location" in data['entry']["annotationsByCategory"].keys() : | 613 if "subcellular-location" in data['entry']["annotationsByCategory"].keys() : |
582 subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"] | 614 subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"] |
583 all_subcell_locs = set() | 615 all_subcell_locs = set() |
608 for tm in tm_domains : | 640 for tm in tm_domains : |
609 all_tm_domains.add(tm['cvTermName']) | 641 all_tm_domains.add(tm['cvTermName']) |
610 nb_domains+=1 | 642 nb_domains+=1 |
611 #print "nb domains ++" | 643 #print "nb domains ++" |
612 #print (nb_domains) | 644 #print (nb_domains) |
| | 645 |
613 nextprot_file[:] = [] | 646 nextprot_file[:] = [] |
614 nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) | 647 nextprot_file.append([id,protein_name,str(seq_length),mass_mol,iso_elec_point,str(nb_domains),all_subcell_locs,all_diseases,function,post_trans_mod,protein_family,pathway,protein_existence,chr_loc]) |
615 writer.writerows(nextprot_file) | 648 writer.writerows(nextprot_file) |
616 | 649 |
617 id = str(10000000000 - int(time.strftime("%Y%m%d"))) | 650 id = str(10000000000 - int(time.strftime("%Y%m%d"))) |
618 | 651 |
619 data_table_entry = dict(id=id, release=release_id, name = name, value = path) | 652 data_table_entry = dict(id=id, release=release_id, name = name, value = path) |
620 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") | 653 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") |
621 | 654 |
622 ####################################################################################################### | 655 ####################################################################################################### |