Mercurial > repos > proteore > proteore_data_manager

diff data_manager/resource_building.py @ 2:9ec42cb35abd draft
planemo upload commit 339ab77a83db03409c8001324f10b36ff5b13a39-dirty
author: proteore
date: Wed, 19 Jun 2019 04:42:03 -0400
parents: f3507260b30f
children: af0250fd023c
--- a/data_manager/resource_building.py	Tue Apr 16 07:46:59 2019 -0400
+++ b/data_manager/resource_building.py	Wed Jun 19 04:42:03 2019 -0400
@@ -53,10 +53,11 @@
     path = os.path.join(target_directory, output_file)
     unzip(url, path)    #download and save file
     tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
-    tissue_id = tissue_name.replace(" ","_").replace("/","-")
+    release = tissue_name.replace(" ","_").replace("/","-")
+    id = str(10000000000 - int(time.strftime("%Y%m%d")))
 
 
-    data_table_entry = dict(id=tissue_id, name = tissue_name, tissue = tissue, value = path)
+    data_table_entry = dict(id=id, release=release, name = tissue_name, tissue = tissue, value = path)
     _add_data_table_entry(data_manager_dict, data_table_entry, table)
 
 
@@ -140,8 +141,6 @@
     if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
     else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
 
-    #print("header ok")
-
     #get selected.tab and keep only ids of interest
     selected_tab_file=species_dict[species]+"_"+files[0]
     tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
@@ -153,6 +152,21 @@
 
     #print("selected_tab ok")
 
+    #get the list of reviewed UniProt-AC
+    organism = species_dict[species].split("_")[1]
+    query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list"
+
+    with requests.Session() as s:
+        download = s.get(query)
+        decoded_content = download.content.decode('utf-8')
+        uniprot_reviewed_list = decoded_content.splitlines()
+
+    for line in tab[1:]:
+        UniProtAC = line[0]
+        if UniProtAC not in uniprot_reviewed_list :
+            line[0]=""
+            line[1]=""
+
     """
     Supplementary ID to get from HUMAN_9606_idmapping.dat :
     -NextProt,BioGrid,STRING,KEGG
@@ -204,7 +218,7 @@
 
     #print ("tab ok")
 
-    #add missing nextprot ID for human
+    #add missing nextprot ID for human or replace old ones
     if human : 
         #build next_dict
         nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
@@ -217,7 +231,7 @@
         for line in tab[1:] : 
             uniprotID=line[0]
             nextprotID=line[13]
-            if nextprotID == '' and uniprotID in next_dict :
+            if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
                 line[13]=next_dict[uniprotID]
 
     output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
@@ -229,9 +243,10 @@
 
     name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
     name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
-    id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
+    release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
+    id = str(10000000000 - int(time.strftime("%Y%m%d")))    #new ids must be lower than previous ids -> sorting by <filter> in xml is only possible in descending order
 
-    data_table_entry = dict(id=id, name = name, species = species, value = path)
+    data_table_entry = dict(id=id, release=release , name = name, species = species, value = path)
     _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species)
 
 def download_from_uniprot_ftp(file,target_directory) :
@@ -483,15 +498,16 @@
         dico['protein_name']=dico_protein_name
 
     #writing output
-    output_file = species+'_'+interactome+'_'+ time.strftime("%d-%m-%Y") + ".json"
+    output_file = species+'_'+interactome+'_'+ time.strftime("%Y-%m-%d") + ".json"
     path = os.path.join(target_directory,output_file)
     name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y")
-    id = species+"_"+interactome+"_"+ time.strftime("%d-%m-%Y")
+    release = species+"_"+interactome+"_"+ time.strftime("%Y-%m-%d")
+    id = str(10000000000 - int(time.strftime("%Y%m%d")))
 
     with open(path, 'w') as handle:
         json.dump(dico, handle, sort_keys=True)
 
-    data_table_entry = dict(id=id, name = name, species = species, value = path)
+    data_table_entry = dict(id=id, release=release, name = name, species = species, value = path)
     _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries")
 
 #######################################################################################################
@@ -501,8 +517,18 @@
 def Build_nextprot_ref_file(data_manager_dict,target_directory):
     nextprot_ids_file = "nextprot_ac_list_all.txt"
     ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)
-
+    
+    output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
+    path = os.path.join(target_directory,output_file)
+    name = "neXtProt release "+time.strftime("%d-%m-%Y")
+    release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
+    
+    output = open(path, 'w')
+    writer = csv.writer(output,delimiter="\t")
+        
     nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
+    writer.writerows(nextprot_file)
+    
     for id in ids :
         #print (id)
         query="https://api.nextprot.org/entry/"+id+".json"
@@ -547,21 +573,15 @@
             for tm in tm_domains :
                 all_tm_domains.add(tm['cvTermName'])
                 nb_domains+=1
-                print "nb domains ++"
-                print (nb_domains)
-
-    nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
-    
-    output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
-    path = os.path.join(target_directory,output_file)
-    name = "neXtProt release "+time.strftime("%d-%m-%Y")
-    id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
-
-    with open(path, 'w') as output:
-        writer = csv.writer(output,delimiter="\t")
+                #print "nb domains ++"
+                #print (nb_domains)
+        nextprot_file[:] = [] 
+        nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
         writer.writerows(nextprot_file)
 
-    data_table_entry = dict(id=id, name = name, value = path)
+        id = str(10000000000 - int(time.strftime("%Y%m%d")))
+
+    data_table_entry = dict(id=id, release=release_id, name = name, value = path)
     _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
 
 #######################################################################################################
author	proteore
date	Wed, 19 Jun 2019 04:42:03 -0400
parents	f3507260b30f
children	af0250fd023c