proteore_data_manager: data_manager/resource_building.py @ 4:e967a99d66b3 (draft)
"planemo upload commit 540dd383c0617193db43bf11457011888751b022-dirty"

author:   proteore
date:     Thu, 23 Jan 2020 08:38:02 -0500
parents:  af0250fd023c
children: b05fa99ddda2
comparison of 3:af0250fd023c with 4:e967a99d66b3:

--- a/data_manager/resource_building.py (3:af0250fd023c)
+++ b/data_manager/resource_building.py (4:e967a99d66b3)
@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 """
 The purpose of this script is to create source files from different databases to be used in other proteore tools
 """
 
-import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
+import os, shutil, sys, argparse, requests, time, csv, re, json, shutil, zipfile, subprocess
 from io import BytesIO
 from zipfile import ZipFile
 from galaxy.util.json import from_json_string, to_json_string
 
 #######################################################################################################
@@ -129,29 +129,31 @@
 # 3. ID mapping file
 #######################################################################################################
 import ftplib, gzip
 csv.field_size_limit(sys.maxsize) # to handle big files
 
-def id_mapping_sources (data_manager_dict, species, target_directory) :
+def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) :
 
     human = species == "Human"
     species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
     files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
+    archive = os.path.join(tool_data_path, "id_mapping/ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d")))
+    if os.path.isdir(archive) is False : os.mkdir(archive)
 
     #header
-    if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
-    else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
+    if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']]
+    else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']]
 
     #get selected.tab and keep only ids of interest
     selected_tab_file=species_dict[species]+"_"+files[0]
     tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
     with gzip.open(tab_path,"rt") as select :
         tab_reader = csv.reader(select,delimiter="\t")
         for line in tab_reader :
-            tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
-    os.remove(tab_path)
-
+            tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
+    if os.path.exists(os.path.join(archive,tab_path.split("/")[-1])) : os.remove(os.path.join(archive,tab_path.split("/")[-1]))
+    shutil.move(tab_path, archive)
     #print("selected_tab ok")
 
     #get uniprot-AC reviewed
     organism = species_dict[species].split("_")[1]
     query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list"
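
The hunk above switches from deleting each downloaded UniProt source file to archiving it: a dated directory is created under tool_data_path, and every fetched file is moved there after parsing. A minimal sketch of that pattern (the helper name archive_source is illustrative, not in the script):

    import os, shutil, time

    def archive_source(path, tool_data_path, species):
        # dated archive directory, e.g. .../id_mapping/ID_mapping_archive_Human_20200123
        archive = os.path.join(tool_data_path,
                               "id_mapping/ID_mapping_archive_" + species + "_" + time.strftime("%Y%m%d"))
        if not os.path.isdir(archive):
            os.mkdir(archive)
        # shutil.move can fail (or behave platform-dependently) when the
        # destination file already exists, so clear any previous copy first
        target = os.path.join(archive, os.path.basename(path))
        if os.path.exists(target):
            os.remove(target)
        shutil.move(path, archive)
        return archive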
@@ -159,28 +161,34 @@
     with requests.Session() as s:
         download = s.get(query)
         decoded_content = download.content.decode('utf-8')
         uniprot_reviewed_list = decoded_content.splitlines()
 
+    #save reviewed list
+    reviewed_list_path = os.path.join(archive,'uniprot_reviewed_list.txt')
+    with open(reviewed_list_path,'w') as reviewed_list_file:
+        for id in uniprot_reviewed_list:
+            reviewed_list_file.write(id+"\n")
+
+    #remove unreviewed uniprot-AC
     for line in tab[1:]:
-        UniProtAC = line[0]
+        UniProtAC = line[1]
         if UniProtAC not in uniprot_reviewed_list :
-            line[0]=""
             line[1]=""
 
     """
     Supplementary ID to get from HUMAN_9606_idmapping.dat :
         -NextProt,BioGrid,STRING,KEGG
     """
 
     #there's more id type for human
-    if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file
-    else : ids = ['BioGrid','STRING','KEGG' ]
+    if human : ids = ['neXtProt','BioGrid','STRING','KEGG','Gene_Name' ] #ids to get from dat_file
+    else : ids = ['BioGrid','STRING','KEGG','Gene_Name' ]
     unidict = {}
 
     #keep only ids of interest in dictionaries
-    dat_file=species_dict[species]+"_"+files[1]
+    dat_file = species_dict[species]+"_"+files[1]
     dat_path = download_from_uniprot_ftp(dat_file,target_directory)
     with gzip.open(dat_path,"rt") as dat :
         dat_reader = csv.reader(dat,delimiter="\t")
         for line in dat_reader :
             uniprotID=line[0] #UniProtID as key
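
The reviewed-accession filter relies on the legacy UniProt query API: ?query=reviewed:yes+AND+organism:<taxon>&format=list returns one Swiss-Prot accession per line (this www.uniprot.org endpoint has since been superseded by rest.uniprot.org). A sketch of the fetch, returning a set so the per-row membership test is O(1) instead of the linear scan the script performs over a plain list:

    import requests

    def reviewed_accessions(taxon_id):
        # taxon_id e.g. "9606" for human; "list" format = one accession per line
        query = ("https://www.uniprot.org/uniprot/"
                 "?query=reviewed:yes+AND+organism:" + taxon_id + "&format=list")
        with requests.Session() as s:
            download = s.get(query)
        return set(download.content.decode("utf-8").splitlines())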
@@ -192,11 +200,12 @@
                     unidict[uniprotID][id_type]= ";".join([unidict[uniprotID][id_type],cor_id]) #if there is already a value in the dictionnary
                 else :
                     unidict[uniprotID].update({ id_type : cor_id })
             elif id_type in ids :
                 unidict[uniprotID]={id_type : cor_id}
-    os.remove(dat_path)
+    if os.path.exists(os.path.join(archive,dat_path.split("/")[-1])) : os.remove(os.path.join(archive,dat_path.split("/")[-1]))
+    shutil.move(dat_path, archive)
 
     #print("dat_file ok")
 
     #add ids from idmapping.dat to the final tab
     for line in tab[1:] :
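
For context, idmapping.dat is a three-column file (accession, ID type, mapped ID), and the loop ending above accumulates it into a dict of dicts, joining repeated mappings of one type with ';'. A condensed sketch of that accumulation, assuming that column layout:

    import csv, gzip

    def build_unidict(dat_path, wanted):
        # wanted e.g. {'neXtProt','BioGrid','STRING','KEGG','Gene_Name'}
        unidict = {}
        with gzip.open(dat_path, "rt") as dat:
            for ac, id_type, mapped in csv.reader(dat, delimiter="\t"):
                if id_type in wanted:
                    entry = unidict.setdefault(ac, {})
                    # repeated mappings of the same type are ';'-joined
                    entry[id_type] = mapped if id_type not in entry else entry[id_type] + ";" + mapped
        return unidict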
@@ -204,44 +213,51 @@
         if human :
             if uniprotID in unidict :
                 nextprot = access_dictionary(unidict,uniprotID,'neXtProt')
                 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0])
                 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
-                    access_dictionary(unidict,uniprotID,'KEGG')])
+                    access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
             else :
-                line.extend(["","","",""])
+                line.extend(["","","","",""])
         else :
             if uniprotID in unidict :
                 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
-                    access_dictionary(unidict,uniprotID,'KEGG')])
+                    access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
             else :
-                line.extend(["","",""])
+                line.extend(["","","",""])
 
     #print ("tab ok")
 
     #add missing nextprot ID for human or replace old ones
     if human :
         #build next_dict
-        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        with open(nextprot_path,'r') as nextprot_ids :
+            nextprot_ids = nextprot_ids.read().splitlines()
+        if os.path.exists(os.path.join(archive,nextprot_path.split("/")[-1])) : os.remove(os.path.join(archive,nextprot_path.split("/")[-1]))
+        shutil.move(nextprot_path,archive)
         next_dict = {}
         for nextid in nextprot_ids :
             next_dict[nextid.replace("NX_","")] = nextid
-        os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
+        #os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
 
         #add missing nextprot ID
         for line in tab[1:] :
             uniprotID=line[0]
-            nextprotID=line[13]
+            nextprotID=line[14]
             if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
-                line[13]=next_dict[uniprotID]
+                line[14]=next_dict[uniprotID]
 
     output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
     path = os.path.join(target_directory,output_file)
 
     with open(path,"w") as out :
         w = csv.writer(out,delimiter='\t')
         w.writerows(tab)
+
+    subprocess.call(['tar', '-czvf', archive+".tar.gz", archive])
+    shutil.rmtree(archive, ignore_errors=True)
 
     name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
     name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
     release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
     id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be inferior to previous id -> sort by <filter> in xml only in descending order
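
After the TSV is written, the dated archive directory is compressed with an external tar call and deleted. Since archive is an absolute path, tar -czvf stores the full leading path inside the tarball; an equivalent sketch using Python's tarfile module that keeps only the directory's basename (an alternative, not what the script does):

    import os, shutil, tarfile

    def compress_archive(archive):
        # same effect as: tar -czvf <archive>.tar.gz <archive>
        with tarfile.open(archive + ".tar.gz", "w:gz") as tar:
            tar.add(archive, arcname=os.path.basename(archive))
        shutil.rmtree(archive, ignore_errors=True)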
@@ -265,13 +281,12 @@
     ftp = ftplib.FTP("ftp.nextprot.org")
     ftp.login("anonymous", "anonymous")
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-    with open(path,'r') as nextprot_ids :
-        nextprot_ids = nextprot_ids.read().splitlines()
-    return (nextprot_ids)
+
+    return (path)
 
 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
     if key1 in dico :
         if key2 in dico[key1] :
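
id_list_from_nextprot_ftp now returns the downloaded file's path rather than the parsed ID list, leaving reading and archiving to the caller. Note that open(path, 'wb').write hands retrbinary a file handle that is only closed by garbage collection; a context-manager variant of the same download (ftp_dir comes from the surrounding function and is assumed here):

    import ftplib

    def fetch_nextprot_file(file, path, ftp_dir):
        ftp = ftplib.FTP("ftp.nextprot.org")
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        with open(path, "wb") as out:   # closed deterministically
            ftp.retrbinary("RETR " + file, out.write)
        ftp.quit()
        return path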
@@ -371,11 +386,11 @@
 
     ##Bioplex
     elif interactome=="bioplex":
 
         with requests.Session() as s:
-            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv',verify=False)
+            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv')
             r = r.content.decode('utf-8')
             bioplex = csv.reader(r.splitlines(), delimiter='\t')
 
         dico_network = {}
         dico_network["GeneID"]={}
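
The dropped verify=False only affects HTTPS certificate checking, so it was a no-op for this plain-HTTP BioPlex URL. A minimal sketch of the fetch-and-parse step, assuming the same v4a file:

    import csv, requests

    def fetch_bioplex():
        url = "http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv"
        with requests.Session() as s:
            r = s.get(url)
        # tab-separated rows: gene IDs, UniProt ACs, symbols, confidence scores
        return list(csv.reader(r.content.decode("utf-8").splitlines(), delimiter="\t"))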
@@ -595,10 +610,11 @@
     parser.add_argument("--interactome", metavar = ("PPI"))
     parser.add_argument("--species")
     parser.add_argument("--date")
     parser.add_argument("-o", "--output")
     parser.add_argument("--database")
+    parser.add_argument("--tool_data_path")
     args = parser.parse_args()
 
     data_manager_dict = {}
     # Extract json file params
     filename = args.output
@@ -629,17 +645,17 @@
         for pa_tissue in peptide_atlas:
             peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)
 
     ## Download ID_mapping source file from Uniprot
     try:
-        id_mapping=args.id_mapping
+        id_mapping = args.id_mapping
     except NameError:
         id_mapping = None
     if id_mapping is not None:
         id_mapping = id_mapping .split(",")
         for species in id_mapping :
-            id_mapping_sources(data_manager_dict, species, target_directory)
+            id_mapping_sources(data_manager_dict, species, target_directory, args.tool_data_path)
 
     ## Download PPI ref files from biogrid/bioplex/humap
     try:
         interactome=args.interactome
         if interactome == "biogrid" :
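
Taken together, the new --tool_data_path argument threads Galaxy's tool-data directory down to id_mapping_sources so the tarred archive lands next to the managed tables. A hypothetical direct invocation (paths and values are illustrative):

    python resource_building.py --id_mapping Human,Mouse \
        -o output.json --tool_data_path /galaxy/tool-data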