proteore_data_manager: data_manager/resource_building.py

comparison data_manager/resource_building.py @ 2:9ec42cb35abd draft

planemo upload commit 339ab77a83db03409c8001324f10b36ff5b13a39-dirty

author	proteore
date	Wed, 19 Jun 2019 04:42:03 -0400
parents	f3507260b30f
children	af0250fd023c

comparison

Legend: equal, deleted, inserted, replaced

-:f3507260b30f
+:9ec42cb35abd
 output_file = tissue +"_"+ time.strftime("%d-%m-%Y") + ".tsv"
 path = os.path.join(target_directory, output_file)
 unzip(url, path)    #download and save file
 tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
-tissue_id = tissue_name.replace(" ","_").replace("/","-")
+release = tissue_name.replace(" ","_").replace("/","-")
+id = str(10000000000 - int(time.strftime("%Y%m%d")))
-data_table_entry = dict(id=tissue_id, name = tissue_name, tissue = tissue, value = path)
+data_table_entry = dict(id=id, release=release, name = tissue_name, tissue = tissue, value = path)
 _add_data_table_entry(data_manager_dict, data_table_entry, table)
 #######################################################################################################
 # 2. Peptide Atlas
 #header
 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
-#print("header ok")
 #get selected.tab and keep only ids of interest
 selected_tab_file=species_dict[species]+"_"+files[0]
 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
 with gzip.open(tab_path,"rt") as select :
 tab_reader = csv.reader(select,delimiter="\t")
 for line in tab_reader :
 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
 os.remove(tab_path)
 #print("selected_tab ok")
+#get the list of reviewed UniProt-AC
+organism = species_dict[species].split("_")[1]
+query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list"
+with requests.Session() as s:
+download = s.get(query)
+decoded_content = download.content.decode('utf-8')
+uniprot_reviewed_list = decoded_content.splitlines()
+for line in tab[1:]:
+UniProtAC = line[0]
+if UniProtAC not in uniprot_reviewed_list :
+line[0]=""
+line[1]=""
 """
 Supplementary ID to get from HUMAN_9606_idmapping.dat :
 -NextProt,BioGrid,STRING,KEGG
 """
 else :
 line.extend(["","",""])
 #print ("tab ok")
-#add missing nextprot ID for human
+#add missing nextprot ID for human or replace old ones
 if human :
 #build next_dict
 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
 next_dict = {}
 for nextid in nextprot_ids :
 #add missing nextprot ID
 for line in tab[1:] :
 uniprotID=line[0]
 nextprotID=line[13]
-if nextprotID == '' and uniprotID in next_dict :
+if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
 line[13]=next_dict[uniprotID]
 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
 path = os.path.join(target_directory,output_file)
 w = csv.writer(out,delimiter='\t')
 w.writerows(tab)
 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
-id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
+release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
+id = str(10000000000 - int(time.strftime("%Y%m%d")))    #new ids must be lower than previous ids, because sorting with <filter> in the xml is only possible in descending order
-data_table_entry = dict(id=id, name = name, species = species, value = path)
+data_table_entry = dict(id=id, release=release , name = name, species = species, value = path)
 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species)
 def download_from_uniprot_ftp(file,target_directory) :
 ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
 path = os.path.join(target_directory, file)
 dico['nodes']=dico_nodes
 dico['gene_name']=dico_geneid_to_gene_name
 dico['protein_name']=dico_protein_name
 #writing output
-output_file = species+'_'+interactome+'_'+ time.strftime("%d-%m-%Y") + ".json"
+output_file = species+'_'+interactome+'_'+ time.strftime("%Y-%m-%d") + ".json"
 path = os.path.join(target_directory,output_file)
 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y")
-id = species+"_"+interactome+"_"+ time.strftime("%d-%m-%Y")
+release = species+"_"+interactome+"_"+ time.strftime("%Y-%m-%d")
+id = str(10000000000 - int(time.strftime("%Y%m%d")))
 with open(path, 'w') as handle:
 json.dump(dico, handle, sort_keys=True)
-data_table_entry = dict(id=id, name = name, species = species, value = path)
+data_table_entry = dict(id=id, release=release, name = name, species = species, value = path)
 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries")
 #######################################################################################################
 # 5. nextprot (add protein features)
 #######################################################################################################
 def Build_nextprot_ref_file(data_manager_dict,target_directory):
 nextprot_ids_file = "nextprot_ac_list_all.txt"
 ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)
+output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
+path = os.path.join(target_directory,output_file)
+name = "neXtProt release "+time.strftime("%d-%m-%Y")
+release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
+output = open(path, 'w')
+writer = csv.writer(output,delimiter="\t")
 nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
+writer.writerows(nextprot_file)
 for id in ids :
 #print (id)
 query="https://api.nextprot.org/entry/"+id+".json"
 resp = requests.get(url=query)
 data = resp.json()
 tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
 all_tm_domains = set()
 for tm in tm_domains :
 all_tm_domains.add(tm['cvTermName'])
 nb_domains+=1
-print "nb domains ++"
+#print "nb domains ++"
-print (nb_domains)
+#print (nb_domains)
+nextprot_file[:] = []
 nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
-output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
-path = os.path.join(target_directory,output_file)
-name = "neXtProt release "+time.strftime("%d-%m-%Y")
-id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
-with open(path, 'w') as output:
-writer = csv.writer(output,delimiter="\t")
 writer.writerows(nextprot_file)
-data_table_entry = dict(id=id, name = name, value = path)
+id = str(10000000000 - int(time.strftime("%Y%m%d")))
+data_table_entry = dict(id=id, release=release_id, name = name, value = path)
 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
 #######################################################################################################
 # Main function
 #######################################################################################################

Mercurial > repos > proteore > proteore_data_manager

comparison data_manager/resource_building.py @ 2:9ec42cb35abd draft