proteore_data_manager: data_manager/resource_building.py @ 4:e967a99d66b3 (draft)
"planemo upload commit 540dd383c0617193db43bf11457011888751b022-dirty"

author:   proteore
date:     Thu, 23 Jan 2020 08:38:02 -0500
parents:  af0250fd023c
children: b05fa99ddda2
comparison of 3:af0250fd023c with 4:e967a99d66b3:

--- a/data_manager/resource_building.py (3:af0250fd023c)
+++ b/data_manager/resource_building.py (4:e967a99d66b3)
@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 """
 The purpose of this script is to create source files from different databases to be used in other proteore tools
 """
 
-import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
+import os, shutil, sys, argparse, requests, time, csv, re, json, shutil, zipfile, subprocess
 from io import BytesIO
 from zipfile import ZipFile
 from galaxy.util.json import from_json_string, to_json_string
 
 #######################################################################################################
@@ -129,29 +129,31 @@
 # 3. ID mapping file
 #######################################################################################################
 import ftplib, gzip
 csv.field_size_limit(sys.maxsize) # to handle big files
 
-def id_mapping_sources (data_manager_dict, species, target_directory) :
+def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) :
 
     human = species == "Human"
     species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
     files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
+    archive = os.path.join(tool_data_path, "id_mapping/ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d")))
+    if os.path.isdir(archive) is False : os.mkdir(archive)
 
     #header
-    if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
-    else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
+    if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']]
+    else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']]
 
     #get selected.tab and keep only ids of interest
     selected_tab_file=species_dict[species]+"_"+files[0]
     tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
     with gzip.open(tab_path,"rt") as select :
         tab_reader = csv.reader(select,delimiter="\t")
         for line in tab_reader :
-            tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
-    os.remove(tab_path)
-
+            tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
+    if os.path.exists(os.path.join(archive,tab_path.split("/")[-1])) : os.remove(os.path.join(archive,tab_path.split("/")[-1]))
+    shutil.move(tab_path, archive)
     #print("selected_tab ok")
 
     #get uniprot-AC reviewed
     organism = species_dict[species].split("_")[1]
     query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list"
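
The hunk above switches from deleting each downloaded UniProt source file to archiving it: a dated directory is created under tool_data_path, and every fetched file is moved there after parsing. A minimal sketch of that pattern (the helper name archive_source is illustrative, not in the script):

    import os, shutil, time

    def archive_source(path, tool_data_path, species):
        # dated archive directory, e.g. .../id_mapping/ID_mapping_archive_Human_20200123
        archive = os.path.join(tool_data_path,
                               "id_mapping/ID_mapping_archive_" + species + "_" + time.strftime("%Y%m%d"))
        if not os.path.isdir(archive):
            os.mkdir(archive)
        # shutil.move can fail (or behave platform-dependently) when the
        # destination file already exists, so clear any previous copy first
        target = os.path.join(archive, os.path.basename(path))
        if os.path.exists(target):
            os.remove(target)
        shutil.move(path, archive)
        return archive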
@@ -159,28 +161,34 @@
     with requests.Session() as s:
         download = s.get(query)
         decoded_content = download.content.decode('utf-8')
         uniprot_reviewed_list = decoded_content.splitlines()
 
+    #save reviewed list
+    reviewed_list_path = os.path.join(archive,'uniprot_reviewed_list.txt')
+    with open(reviewed_list_path,'w') as reviewed_list_file:
+        for id in uniprot_reviewed_list:
+            reviewed_list_file.write(id+"\n")
+
+    #remove unreviewed uniprot-AC
     for line in tab[1:]:
-        UniProtAC = line[0]
+        UniProtAC = line[1]
         if UniProtAC not in uniprot_reviewed_list :
-            line[0]=""
             line[1]=""
 
     """
     Supplementary ID to get from HUMAN_9606_idmapping.dat :
         -NextProt,BioGrid,STRING,KEGG
     """
 
     #there's more id type for human
-    if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file
-    else : ids = ['BioGrid','STRING','KEGG' ]
+    if human : ids = ['neXtProt','BioGrid','STRING','KEGG','Gene_Name' ] #ids to get from dat_file
+    else : ids = ['BioGrid','STRING','KEGG','Gene_Name' ]
     unidict = {}
 
     #keep only ids of interest in dictionaries
-    dat_file=species_dict[species]+"_"+files[1]
+    dat_file = species_dict[species]+"_"+files[1]
     dat_path = download_from_uniprot_ftp(dat_file,target_directory)
     with gzip.open(dat_path,"rt") as dat :
         dat_reader = csv.reader(dat,delimiter="\t")
         for line in dat_reader :
             uniprotID=line[0] #UniProtID as key
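
The reviewed-accession filter relies on the legacy UniProt query API: ?query=reviewed:yes+AND+organism:<taxon>&format=list returns one Swiss-Prot accession per line (this www.uniprot.org endpoint has since been superseded by rest.uniprot.org). A sketch of the fetch, returning a set so the per-row membership test is O(1) instead of the linear scan the script performs over a plain list:

    import requests

    def reviewed_accessions(taxon_id):
        # taxon_id e.g. "9606" for human; "list" format = one accession per line
        query = ("https://www.uniprot.org/uniprot/"
                 "?query=reviewed:yes+AND+organism:" + taxon_id + "&format=list")
        with requests.Session() as s:
            download = s.get(query)
        return set(download.content.decode("utf-8").splitlines())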
@@ -192,11 +200,12 @@
                     unidict[uniprotID][id_type]= ";".join([unidict[uniprotID][id_type],cor_id]) #if there is already a value in the dictionnary
                 else :
                     unidict[uniprotID].update({ id_type : cor_id })
             elif id_type in ids :
                 unidict[uniprotID]={id_type : cor_id}
-    os.remove(dat_path)
+    if os.path.exists(os.path.join(archive,dat_path.split("/")[-1])) : os.remove(os.path.join(archive,dat_path.split("/")[-1]))
+    shutil.move(dat_path, archive)
 
     #print("dat_file ok")
 
     #add ids from idmapping.dat to the final tab
     for line in tab[1:] :
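
For context, idmapping.dat is a three-column file (accession, ID type, mapped ID), and the loop ending above accumulates it into a dict of dicts, joining repeated mappings of one type with ';'. A condensed sketch of that accumulation, assuming that column layout:

    import csv, gzip

    def build_unidict(dat_path, wanted):
        # wanted e.g. {'neXtProt','BioGrid','STRING','KEGG','Gene_Name'}
        unidict = {}
        with gzip.open(dat_path, "rt") as dat:
            for ac, id_type, mapped in csv.reader(dat, delimiter="\t"):
                if id_type in wanted:
                    entry = unidict.setdefault(ac, {})
                    # repeated mappings of the same type are ';'-joined
                    entry[id_type] = mapped if id_type not in entry else entry[id_type] + ";" + mapped
        return unidict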
@@ -204,44 +213,51 @@
         if human :
             if uniprotID in unidict :
                 nextprot = access_dictionary(unidict,uniprotID,'neXtProt')
                 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0])
                 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
-                    access_dictionary(unidict,uniprotID,'KEGG')])
+                    access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
             else :
-                line.extend(["","","",""])
+                line.extend(["","","","",""])
         else :
             if uniprotID in unidict :
                 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
-                    access_dictionary(unidict,uniprotID,'KEGG')])
+                    access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
             else :
-                line.extend(["","",""])
+                line.extend(["","","",""])
 
     #print ("tab ok")
 
     #add missing nextprot ID for human or replace old ones
     if human :
         #build next_dict
-        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        with open(nextprot_path,'r') as nextprot_ids :
+            nextprot_ids = nextprot_ids.read().splitlines()
+        if os.path.exists(os.path.join(archive,nextprot_path.split("/")[-1])) : os.remove(os.path.join(archive,nextprot_path.split("/")[-1]))
+        shutil.move(nextprot_path,archive)
         next_dict = {}
         for nextid in nextprot_ids :
             next_dict[nextid.replace("NX_","")] = nextid
-        os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
+        #os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
 
         #add missing nextprot ID
         for line in tab[1:] :
             uniprotID=line[0]
-            nextprotID=line[13]
+            nextprotID=line[14]
             if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
-                line[13]=next_dict[uniprotID]
+                line[14]=next_dict[uniprotID]
 
     output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
     path = os.path.join(target_directory,output_file)
 
     with open(path,"w") as out :
         w = csv.writer(out,delimiter='\t')
         w.writerows(tab)
+
+    subprocess.call(['tar', '-czvf', archive+".tar.gz", archive])
+    shutil.rmtree(archive, ignore_errors=True)
 
     name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
     name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
     release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
     id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be inferior to previous id -> sort by <filter> in xml only in descending order
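
After the TSV is written, the dated archive directory is compressed with an external tar call and deleted. Since archive is an absolute path, tar -czvf stores the full leading path inside the tarball; an equivalent sketch using Python's tarfile module that keeps only the directory's basename (an alternative, not what the script does):

    import os, shutil, tarfile

    def compress_archive(archive):
        # same effect as: tar -czvf <archive>.tar.gz <archive>
        with tarfile.open(archive + ".tar.gz", "w:gz") as tar:
            tar.add(archive, arcname=os.path.basename(archive))
        shutil.rmtree(archive, ignore_errors=True)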
@@ -265,13 +281,12 @@
     ftp = ftplib.FTP("ftp.nextprot.org")
     ftp.login("anonymous", "anonymous")
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-    with open(path,'r') as nextprot_ids :
-        nextprot_ids = nextprot_ids.read().splitlines()
-    return (nextprot_ids)
+
+    return (path)
 
 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
     if key1 in dico :
         if key2 in dico[key1] :
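
id_list_from_nextprot_ftp now returns the downloaded file's path rather than the parsed ID list, leaving reading and archiving to the caller. Note that open(path, 'wb').write hands retrbinary a file handle that is only closed by garbage collection; a context-manager variant of the same download (ftp_dir comes from the surrounding function and is assumed here):

    import ftplib

    def fetch_nextprot_file(file, path, ftp_dir):
        ftp = ftplib.FTP("ftp.nextprot.org")
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        with open(path, "wb") as out:   # closed deterministically
            ftp.retrbinary("RETR " + file, out.write)
        ftp.quit()
        return path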
@@ -371,11 +386,11 @@
 
     ##Bioplex
     elif interactome=="bioplex":
 
         with requests.Session() as s:
-            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv',verify=False)
+            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv')
             r = r.content.decode('utf-8')
             bioplex = csv.reader(r.splitlines(), delimiter='\t')
 
         dico_network = {}
         dico_network["GeneID"]={}
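
The dropped verify=False only affects HTTPS certificate checking, so it was a no-op for this plain-HTTP BioPlex URL. A minimal sketch of the fetch-and-parse step, assuming the same v4a file:

    import csv, requests

    def fetch_bioplex():
        url = "http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv"
        with requests.Session() as s:
            r = s.get(url)
        # tab-separated rows: gene IDs, UniProt ACs, symbols, confidence scores
        return list(csv.reader(r.content.decode("utf-8").splitlines(), delimiter="\t"))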
@@ -595,10 +610,11 @@
     parser.add_argument("--interactome", metavar = ("PPI"))
     parser.add_argument("--species")
     parser.add_argument("--date")
     parser.add_argument("-o", "--output")
     parser.add_argument("--database")
+    parser.add_argument("--tool_data_path")
     args = parser.parse_args()
 
     data_manager_dict = {}
     # Extract json file params
     filename = args.output
@@ -629,17 +645,17 @@
         for pa_tissue in peptide_atlas:
             peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)
 
     ## Download ID_mapping source file from Uniprot
     try:
-        id_mapping=args.id_mapping
+        id_mapping = args.id_mapping
     except NameError:
         id_mapping = None
     if id_mapping is not None:
         id_mapping = id_mapping .split(",")
         for species in id_mapping :
-            id_mapping_sources(data_manager_dict, species, target_directory)
+            id_mapping_sources(data_manager_dict, species, target_directory, args.tool_data_path)
 
     ## Download PPI ref files from biogrid/bioplex/humap
     try:
         interactome=args.interactome
         if interactome == "biogrid" :
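
Taken together, the new --tool_data_path argument threads Galaxy's tool-data directory down to id_mapping_sources so the tarred archive lands next to the managed tables. A hypothetical direct invocation (paths and values are illustrative):

    python resource_building.py --id_mapping Human,Mouse \
        -o output.json --tool_data_path /galaxy/tool-data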