Mercurial > repos > proteore > proteore_data_manager
comparison data_manager/resource_building.py @ 2:9ec42cb35abd draft
planemo upload commit 339ab77a83db03409c8001324f10b36ff5b13a39-dirty
author | proteore |
---|---|
date | Wed, 19 Jun 2019 04:42:03 -0400 |
parents | f3507260b30f |
children | af0250fd023c |
comparison
equal
deleted
inserted
replaced
1:f3507260b30f | 2:9ec42cb35abd |
---|---|
51 | 51 |
52 output_file = tissue +"_"+ time.strftime("%d-%m-%Y") + ".tsv" | 52 output_file = tissue +"_"+ time.strftime("%d-%m-%Y") + ".tsv" |
53 path = os.path.join(target_directory, output_file) | 53 path = os.path.join(target_directory, output_file) |
54 unzip(url, path) #download and save file | 54 unzip(url, path) #download and save file |
55 tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y") | 55 tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y") |
56 tissue_id = tissue_name.replace(" ","_").replace("/","-") | 56 release = tissue_name.replace(" ","_").replace("/","-") |
57 | 57 id = str(10000000000 - int(time.strftime("%Y%m%d"))) |
58 | 58 |
59 data_table_entry = dict(id=tissue_id, name = tissue_name, tissue = tissue, value = path) | 59 |
60 data_table_entry = dict(id=id, release=release, name = tissue_name, tissue = tissue, value = path) | |
60 _add_data_table_entry(data_manager_dict, data_table_entry, table) | 61 _add_data_table_entry(data_manager_dict, data_table_entry, table) |
61 | 62 |
62 | 63 |
63 ####################################################################################################### | 64 ####################################################################################################### |
64 # 2. Peptide Atlas | 65 # 2. Peptide Atlas |
138 | 139 |
139 #header | 140 #header |
140 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] | 141 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] |
141 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] | 142 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] |
142 | 143 |
143 #print("header ok") | |
144 | |
145 #get selected.tab and keep only ids of interest | 144 #get selected.tab and keep only ids of interest |
146 selected_tab_file=species_dict[species]+"_"+files[0] | 145 selected_tab_file=species_dict[species]+"_"+files[0] |
147 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) | 146 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) |
148 with gzip.open(tab_path,"rt") as select : | 147 with gzip.open(tab_path,"rt") as select : |
149 tab_reader = csv.reader(select,delimiter="\t") | 148 tab_reader = csv.reader(select,delimiter="\t") |
150 for line in tab_reader : | 149 for line in tab_reader : |
151 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) | 150 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) |
152 os.remove(tab_path) | 151 os.remove(tab_path) |
153 | 152 |
154 #print("selected_tab ok") | 153 #print("selected_tab ok") |
154 | |
155 #get uniprot-AC reviewed | |
156 organism = species_dict[species].split("_")[1] | |
157 query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list" | |
158 | |
159 with requests.Session() as s: | |
160 download = s.get(query) | |
161 decoded_content = download.content.decode('utf-8') | |
162 uniprot_reviewed_list = decoded_content.splitlines() | |
163 | |
164 for line in tab[1:]: | |
165 UniProtAC = line[0] | |
166 if UniProtAC not in uniprot_reviewed_list : | |
167 line[0]="" | |
168 line[1]="" | |
155 | 169 |
156 """ | 170 """ |
157 Supplementary ID to get from HUMAN_9606_idmapping.dat : | 171 Supplementary ID to get from HUMAN_9606_idmapping.dat : |
158 -NextProt,BioGrid,STRING,KEGG | 172 -NextProt,BioGrid,STRING,KEGG |
159 """ | 173 """ |
202 else : | 216 else : |
203 line.extend(["","",""]) | 217 line.extend(["","",""]) |
204 | 218 |
205 #print ("tab ok") | 219 #print ("tab ok") |
206 | 220 |
207 #add missing nextprot ID for human | 221 #add missing nextprot ID for human or replace old ones |
208 if human : | 222 if human : |
209 #build next_dict | 223 #build next_dict |
210 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) | 224 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) |
211 next_dict = {} | 225 next_dict = {} |
212 for nextid in nextprot_ids : | 226 for nextid in nextprot_ids : |
215 | 229 |
216 #add missing nextprot ID | 230 #add missing nextprot ID |
217 for line in tab[1:] : | 231 for line in tab[1:] : |
218 uniprotID=line[0] | 232 uniprotID=line[0] |
219 nextprotID=line[13] | 233 nextprotID=line[13] |
220 if nextprotID == '' and uniprotID in next_dict : | 234 if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) : |
221 line[13]=next_dict[uniprotID] | 235 line[13]=next_dict[uniprotID] |
222 | 236 |
223 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" | 237 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" |
224 path = os.path.join(target_directory,output_file) | 238 path = os.path.join(target_directory,output_file) |
225 | 239 |
227 w = csv.writer(out,delimiter='\t') | 241 w = csv.writer(out,delimiter='\t') |
228 w.writerows(tab) | 242 w.writerows(tab) |
229 | 243 |
230 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} | 244 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} |
231 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" | 245 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" |
232 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") | 246 release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") |
233 | 247 id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be inferior to previous id -> sort by <filter> in xml only in descending order |
234 data_table_entry = dict(id=id, name = name, species = species, value = path) | 248 |
249 data_table_entry = dict(id=id, release=release , name = name, species = species, value = path) | |
235 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species) | 250 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species) |
236 | 251 |
237 def download_from_uniprot_ftp(file,target_directory) : | 252 def download_from_uniprot_ftp(file,target_directory) : |
238 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" | 253 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/" |
239 path = os.path.join(target_directory, file) | 254 path = os.path.join(target_directory, file) |
481 dico['nodes']=dico_nodes | 496 dico['nodes']=dico_nodes |
482 dico['gene_name']=dico_geneid_to_gene_name | 497 dico['gene_name']=dico_geneid_to_gene_name |
483 dico['protein_name']=dico_protein_name | 498 dico['protein_name']=dico_protein_name |
484 | 499 |
485 #writing output | 500 #writing output |
486 output_file = species+'_'+interactome+'_'+ time.strftime("%d-%m-%Y") + ".json" | 501 output_file = species+'_'+interactome+'_'+ time.strftime("%Y-%m-%d") + ".json" |
487 path = os.path.join(target_directory,output_file) | 502 path = os.path.join(target_directory,output_file) |
488 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y") | 503 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y") |
489 id = species+"_"+interactome+"_"+ time.strftime("%d-%m-%Y") | 504 release = species+"_"+interactome+"_"+ time.strftime("%Y-%m-%d") |
505 id = str(10000000000 - int(time.strftime("%Y%m%d"))) | |
490 | 506 |
491 with open(path, 'w') as handle: | 507 with open(path, 'w') as handle: |
492 json.dump(dico, handle, sort_keys=True) | 508 json.dump(dico, handle, sort_keys=True) |
493 | 509 |
494 data_table_entry = dict(id=id, name = name, species = species, value = path) | 510 data_table_entry = dict(id=id, release=release, name = name, species = species, value = path) |
495 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") | 511 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") |
496 | 512 |
497 ####################################################################################################### | 513 ####################################################################################################### |
498 # 5. nextprot (add protein features) | 514 # 5. nextprot (add protein features) |
499 ####################################################################################################### | 515 ####################################################################################################### |
500 | 516 |
501 def Build_nextprot_ref_file(data_manager_dict,target_directory): | 517 def Build_nextprot_ref_file(data_manager_dict,target_directory): |
502 nextprot_ids_file = "nextprot_ac_list_all.txt" | 518 nextprot_ids_file = "nextprot_ac_list_all.txt" |
503 ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory) | 519 ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory) |
504 | 520 |
521 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" | |
522 path = os.path.join(target_directory,output_file) | |
523 name = "neXtProt release "+time.strftime("%d-%m-%Y") | |
524 release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y") | |
525 | |
526 output = open(path, 'w') | |
527 writer = csv.writer(output,delimiter="\t") | |
528 | |
505 nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]] | 529 nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]] |
530 writer.writerows(nextprot_file) | |
531 | |
506 for id in ids : | 532 for id in ids : |
507 #print (id) | 533 #print (id) |
508 query="https://api.nextprot.org/entry/"+id+".json" | 534 query="https://api.nextprot.org/entry/"+id+".json" |
509 resp = requests.get(url=query) | 535 resp = requests.get(url=query) |
510 data = resp.json() | 536 data = resp.json() |
545 tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"] | 571 tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"] |
546 all_tm_domains = set() | 572 all_tm_domains = set() |
547 for tm in tm_domains : | 573 for tm in tm_domains : |
548 all_tm_domains.add(tm['cvTermName']) | 574 all_tm_domains.add(tm['cvTermName']) |
549 nb_domains+=1 | 575 nb_domains+=1 |
550 print "nb domains ++" | 576 #print "nb domains ++" |
551 print (nb_domains) | 577 #print (nb_domains) |
552 | 578 nextprot_file[:] = [] |
553 nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) | 579 nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) |
554 | |
555 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" | |
556 path = os.path.join(target_directory,output_file) | |
557 name = "neXtProt release "+time.strftime("%d-%m-%Y") | |
558 id = "nextprot_ref_"+time.strftime("%d-%m-%Y") | |
559 | |
560 with open(path, 'w') as output: | |
561 writer = csv.writer(output,delimiter="\t") | |
562 writer.writerows(nextprot_file) | 580 writer.writerows(nextprot_file) |
563 | 581 |
564 data_table_entry = dict(id=id, name = name, value = path) | 582 id = str(10000000000 - int(time.strftime("%Y%m%d"))) |
583 | |
584 data_table_entry = dict(id=id, release=release_id, name = name, value = path) | |
565 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") | 585 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") |
566 | 586 |
567 ####################################################################################################### | 587 ####################################################################################################### |
568 # Main function | 588 # Main function |
569 ####################################################################################################### | 589 ####################################################################################################### |