Mercurial > repos > proteore > proteore_data_manager
diff data_manager/resource_building.py @ 2:9ec42cb35abd draft
planemo upload commit 339ab77a83db03409c8001324f10b36ff5b13a39-dirty
author | proteore |
---|---|
date | Wed, 19 Jun 2019 04:42:03 -0400 |
parents | f3507260b30f |
children | af0250fd023c |
line wrap: on
line diff
--- a/data_manager/resource_building.py Tue Apr 16 07:46:59 2019 -0400 +++ b/data_manager/resource_building.py Wed Jun 19 04:42:03 2019 -0400 @@ -53,10 +53,11 @@ path = os.path.join(target_directory, output_file) unzip(url, path) #download and save file tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y") - tissue_id = tissue_name.replace(" ","_").replace("/","-") + release = tissue_name.replace(" ","_").replace("/","-") + id = str(10000000000 - int(time.strftime("%Y%m%d"))) - data_table_entry = dict(id=tissue_id, name = tissue_name, tissue = tissue, value = path) + data_table_entry = dict(id=id, release=release, name = tissue_name, tissue = tissue, value = path) _add_data_table_entry(data_manager_dict, data_table_entry, table) @@ -140,8 +141,6 @@ if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] - #print("header ok") - #get selected.tab and keep only ids of interest selected_tab_file=species_dict[species]+"_"+files[0] tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) @@ -153,6 +152,21 @@ #print("selected_tab ok") + #get uniprot-AC reviewed + organism = species_dict[species].split("_")[1] + query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list" + + with requests.Session() as s: + download = s.get(query) + decoded_content = download.content.decode('utf-8') + uniprot_reviewed_list = decoded_content.splitlines() + + for line in tab[1:]: + UniProtAC = line[0] + if UniProtAC not in uniprot_reviewed_list : + line[0]="" + line[1]="" + """ Supplementary ID to get from HUMAN_9606_idmapping.dat : -NextProt,BioGrid,STRING,KEGG @@ -204,7 +218,7 @@ #print ("tab ok") - #add missing nextprot ID for human + #add missing nextprot ID for human or replace old ones if human : #build next_dict nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) @@ -217,7 +231,7 @@ for line in tab[1:] : uniprotID=line[0] nextprotID=line[13] - if nextprotID == '' and uniprotID in next_dict : + if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) : line[13]=next_dict[uniprotID] output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" @@ -229,9 +243,10 @@ name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" - id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be inferior to previous id -> sort by <filter> in xml only in descending order - data_table_entry = dict(id=id, name = name, species = species, value = path) + data_table_entry = dict(id=id, release=release , name = name, species = species, value = path) _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species) def download_from_uniprot_ftp(file,target_directory) : @@ -483,15 +498,16 @@ dico['protein_name']=dico_protein_name #writing output - output_file = species+'_'+interactome+'_'+ time.strftime("%d-%m-%Y") + ".json" + output_file = species+'_'+interactome+'_'+ time.strftime("%Y-%m-%d") + ".json" path = os.path.join(target_directory,output_file) name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y") - id = species+"_"+interactome+"_"+ time.strftime("%d-%m-%Y") + release = species+"_"+interactome+"_"+ time.strftime("%Y-%m-%d") + id = str(10000000000 - int(time.strftime("%Y%m%d"))) with open(path, 'w') as handle: json.dump(dico, handle, sort_keys=True) - data_table_entry = dict(id=id, name = name, species = species, value = path) + data_table_entry = dict(id=id, release=release, name = name, species = species, value = path) _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") ####################################################################################################### @@ -501,8 +517,18 @@ def Build_nextprot_ref_file(data_manager_dict,target_directory): nextprot_ids_file = "nextprot_ac_list_all.txt" ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory) - + + output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" + path = os.path.join(target_directory,output_file) + name = "neXtProt release "+time.strftime("%d-%m-%Y") + release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y") + + output = open(path, 'w') + writer = csv.writer(output,delimiter="\t") + nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]] + writer.writerows(nextprot_file) + for id in ids : #print (id) query="https://api.nextprot.org/entry/"+id+".json" @@ -547,21 +573,15 @@ for tm in tm_domains : all_tm_domains.add(tm['cvTermName']) nb_domains+=1 - print "nb domains ++" - print (nb_domains) - - nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) - - output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" - path = os.path.join(target_directory,output_file) - name = "neXtProt release "+time.strftime("%d-%m-%Y") - id = "nextprot_ref_"+time.strftime("%d-%m-%Y") - - with open(path, 'w') as output: - writer = csv.writer(output,delimiter="\t") + #print "nb domains ++" + #print (nb_domains) + nextprot_file[:] = [] + nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) writer.writerows(nextprot_file) - data_table_entry = dict(id=id, name = name, value = path) + id = str(10000000000 - int(time.strftime("%Y%m%d"))) + + data_table_entry = dict(id=id, release=release_id, name = name, value = path) _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") #######################################################################################################