comparison data_manager/resource_building.py @ 0:9e31ea9fc7ea draft
planemo upload commit 567ba7934c0ca55529dfeb5e7ca0935ace260ad7-dirty
author:   proteore
date:     Wed, 13 Mar 2019 06:30:42 -0400
parents:  (none)
children: f3507260b30f
# -*- coding: utf-8 -*-
"""
The purpose of this script is to create source files from different databases to be used in other ProteoRE tools
"""

import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
from io import BytesIO
from zipfile import ZipFile
from galaxy.util.json import from_json_string, to_json_string

#######################################################################################################
# General functions
#######################################################################################################
def unzip(url, output_file):
    """
    Download a zip archive from a URL and write its first member to output_file
    """
    content = requests.get(url)
    zip_file = ZipFile(BytesIO(content.content))  # renamed to avoid shadowing the zipfile module
    output_content = zip_file.open(zip_file.namelist()[0]).read()
    with open(output_file, "wb") as output:  # ZipFile.read() returns bytes, so write in binary mode
        output.write(output_content)

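# Usage in this script, e.g. from HPA_sources below:
#   unzip("https://www.proteinatlas.org/download/normal_tissue.tsv.zip", "normal_tissue.tsv")
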
def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
    data_manager_dict['data_tables'][data_table].append(data_table_entry)
    return data_manager_dict

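# For illustration, after one HPA entry the dictionary has this shape (values hypothetical):
#   {"data_tables": {"proteore_protein_atlas_normal_tissue":
#       [{"id": "...", "name": "...", "tissue": "...", "value": "/path/to/file.tsv"}]}}
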
#######################################################################################################
# 1. Human Protein Atlas
#    - Normal tissue
#    - Pathology
#    - Full Atlas
#######################################################################################################
def HPA_sources(data_manager_dict, tissue, target_directory):
    if tissue == "HPA_normal_tissue":
        tissue_name = "HPA normal tissue"
        url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
        table = "proteore_protein_atlas_normal_tissue"
    elif tissue == "HPA_pathology":
        tissue_name = "HPA pathology"
        url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
        table = "proteore_protein_atlas_tumor_tissue"
    elif tissue == "HPA_full_atlas":
        tissue_name = "HPA full atlas"
        url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
        table = "proteore_protein_full_atlas"

    output_file = tissue + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)  # download and save the file
    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
    tissue_id = tissue_name.replace(" ", "_").replace("/", "-")

    data_table_entry = dict(id=tissue_id, name=tissue_name, tissue=tissue, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, table)

#######################################################################################################
# 2. Peptide Atlas
#######################################################################################################
def peptide_atlas_sources(data_manager_dict, tissue, date, target_directory):
    # Define organism_id (here Human) - to be updated when other organisms are added to the project
    organism_id = "2"
    # Extract sample_category_id and output filename
    tissue = tissue.split(".")
    sample_category_id = tissue[0]
    tissue_name = tissue[1]
    output_file = tissue_name + "_" + date + ".tsv"

    query = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id=" + \
        sample_category_id + "&display_options=ShowAbundances&organism_id=" + organism_id + \
        "&redundancy_constraint=4&presence_level_constraint=1%2C2&gene_annotation_level_constraint=leaf" + \
        "&QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY"

    with requests.Session() as s:
        download = s.get(query)
        decoded_content = download.content.decode('utf-8')
        cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

    uni_dict = build_dictionary(cr)

    # columns of the peptide_atlas data table
    tissue_id = tissue_name + "_" + date
    name = tissue_id.replace("-", "/").replace("_", " ")
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(id=tissue_id, name=name, value=path, tissue=tissue_name)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_peptide_atlas")

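# Illustration (hypothetical atlas_build_id): a tissue argument of the form
# "<atlas_build_id>.<tissue_name>", e.g. "472.Human_Brain", queries build 472
# and writes Human_Brain_<date>.tsv.
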
#function to count the number of observations by uniprot id
def build_dictionary(csv_reader):  # parameter renamed from "csv" to avoid shadowing the csv module
    uni_dict = {}
    for line in csv_reader:
        if "-" not in line[0] and check_uniprot_access(line[0]):
            if line[0] in uni_dict:
                uni_dict[line[0]] += int(line[5])
            else:
                uni_dict[line[0]] = int(line[5])

    return uni_dict

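# Illustration (hypothetical rows): with two rows whose column 0 is "P31946" and
# whose column 5 is "12" and "3", build_dictionary returns {"P31946": 15}.
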
#function to check if an id is a UniProt accession number: returns True or False
def check_uniprot_access(id):
    uniprot_pattern = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
    if uniprot_pattern.match(id):
        return True
    else:
        return False

def check_entrez_geneid(id):
    entrez_pattern = re.compile("[0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+")
    if entrez_pattern.match(id):
        return True
    else:
        return False

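# Illustration: check_uniprot_access("P31946") -> True, check_uniprot_access("FOO") -> False;
# check_entrez_geneid("7157") -> True.
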
#######################################################################################################
# 3. ID mapping file
#######################################################################################################
import ftplib, gzip
csv.field_size_limit(sys.maxsize)  # to handle big files

def id_mapping_sources(data_manager_dict, species, target_directory):

    human = species == "Human"
    species_dict = {"Human": "HUMAN_9606", "Mouse": "MOUSE_10090", "Rat": "RAT_10116"}
    files = ["idmapping_selected.tab.gz", "idmapping.dat.gz"]

    #header
    if human:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene", "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "neXtProt", "BioGrid", "STRING", "KEGG"]]
    else:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene", "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "BioGrid", "STRING", "KEGG"]]

    #get selected.tab and keep only ids of interest
    selected_tab_file = species_dict[species] + "_" + files[0]
    tab_path = download_from_uniprot_ftp(selected_tab_file, target_directory)
    with gzip.open(tab_path, "rt") as select:
        tab_reader = csv.reader(select, delimiter="\t")
        for line in tab_reader:
            tab.append([line[i] for i in [0, 1, 2, 3, 4, 5, 6, 11, 13, 14, 18, 19, 20]])
    os.remove(tab_path)

    """
    Supplementary IDs to get from HUMAN_9606_idmapping.dat:
    neXtProt, BioGrid, STRING, KEGG
    """

    #there are more id types for human
    if human:
        ids = ['neXtProt', 'BioGrid', 'STRING', 'KEGG']  # ids to get from dat_file
    else:
        ids = ['BioGrid', 'STRING', 'KEGG']
    unidict = {}

    #keep only ids of interest in dictionaries
    dat_file = species_dict[species] + "_" + files[1]
    dat_path = download_from_uniprot_ftp(dat_file, target_directory)
    with gzip.open(dat_path, "rt") as dat:
        dat_reader = csv.reader(dat, delimiter="\t")
        for line in dat_reader:
            uniprotID = line[0]  # UniProt AC, used as key
            id_type = line[1]    # ID type of the corresponding id, key of the sub-dictionary
            cor_id = line[2]     # corresponding id
            if "-" not in id_type:  # isoforms are not kept
                if id_type in ids and uniprotID in unidict:
                    if id_type in unidict[uniprotID]:
                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type], cor_id])  # there is already a value in the dictionary
                    else:
                        unidict[uniprotID].update({id_type: cor_id})
                elif id_type in ids:
                    unidict[uniprotID] = {id_type: cor_id}
    os.remove(dat_path)

    #add ids from idmapping.dat to the final tab
    for line in tab[1:]:
        uniprotID = line[0]
        if human:
            if uniprotID in unidict:
                nextprot = access_dictionary(unidict, uniprotID, 'neXtProt')
                if nextprot != '':
                    nextprot = clean_nextprot_id(nextprot, line[0])
                line.extend([nextprot, access_dictionary(unidict, uniprotID, 'BioGrid'), access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", "", ""])
        else:
            if uniprotID in unidict:
                line.extend([access_dictionary(unidict, uniprotID, 'BioGrid'), access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", ""])

    #add missing neXtProt IDs for human
    if human:
        #build next_dict
        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt", target_directory)
        next_dict = {}
        for nextid in nextprot_ids:
            next_dict[nextid.replace("NX_", "")] = nextid
        os.remove(os.path.join(target_directory, "nextprot_ac_list_all.txt"))

        #add missing neXtProt ID
        for line in tab[1:]:
            uniprotID = line[0]
            nextprotID = line[13]
            if nextprotID == '' and uniprotID in next_dict:
                line[13] = next_dict[uniprotID]

    output_file = species + "_id_mapping_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerows(tab)

    name_dict = {"Human": "Homo sapiens", "Mouse": "Mus musculus", "Rat": "Rattus norvegicus"}
    name = species + " (" + name_dict[species] + " " + time.strftime("%d/%m/%Y") + ")"
    id = species + "_id_mapping_" + time.strftime("%d-%m-%Y")

    data_table_entry = dict(id=id, name=name, species=species, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_" + species)

def download_from_uniprot_ftp(file, target_directory):
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.uniprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    ftp.retrbinary("RETR " + file, open(path, 'wb').write)
    ftp.quit()
    return (path)

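# Illustration: download_from_uniprot_ftp("HUMAN_9606_idmapping.dat.gz", target_directory)
# fetches that file from the UniProt FTP by_organism/ directory and returns its local path.
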
def id_list_from_nextprot_ftp(file, target_directory):
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    ftp.retrbinary("RETR " + file, open(path, 'wb').write)
    ftp.quit()
    with open(path, 'r') as nextprot_ids:
        nextprot_ids = nextprot_ids.read().splitlines()
    return (nextprot_ids)

#return '' if there's no value in the dictionary, to avoid a KeyError
def access_dictionary(dico, key1, key2):
    if key1 in dico:
        if key2 in dico[key1]:
            return (dico[key1][key2])
        else:
            return ("")
    else:
        return ('')

#if there are several neXtProt IDs for one UniProt AC, return the UniProt-like ID
def clean_nextprot_id(next_id, uniprotAc):
    if len(next_id.split(";")) > 1:
        tmp = next_id.split(";")
        if "NX_" + uniprotAc in tmp:
            return ("NX_" + uniprotAc)
        else:
            return (tmp[1])
    else:
        return (next_id)

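# Illustration (hypothetical second accession):
#   clean_nextprot_id("NX_A0A087X1C5;NX_P31946", "P31946") -> "NX_P31946"
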
#######################################################################################################
# 4. Build protein interaction maps files
#######################################################################################################

def get_interactant_name(line, dico):
    if line[0] in dico:
        interactant_A = dico[line[0]]
    else:
        interactant_A = "NA"

    if line[1] in dico:
        interactant_B = dico[line[1]]
    else:
        interactant_B = "NA"

    return interactant_A, interactant_B

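# Illustration: get_interactant_name(["7157", "4193"], {"7157": "TP53"}) -> ("TP53", "NA").
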
def PPI_ref_files(data_manager_dict, species, interactome, target_directory):

    species_dict = {'Human': 'Homo sapiens', "Mouse": "Mus musculus", "Rat": "Rattus norvegicus"}

    ##BioGRID
    if interactome == "biogrid":

        tab2_link = "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip"

        #download zip file
        r = requests.get(tab2_link)
        with open("BioGRID.zip", "wb") as code:
            code.write(r.content)

        #unzip files
        with zipfile.ZipFile("BioGRID.zip", 'r') as zip_ref:
            if not os.path.exists("tmp_BioGRID"):
                os.makedirs("tmp_BioGRID")
            zip_ref.extractall("tmp_BioGRID")

        #import file of interest and build dictionary
        file_path = "tmp_BioGRID/BIOGRID-ORGANISM-" + species_dict[species].replace(" ", "_") + "-3.5.167.tab2.txt"
        with open(file_path, "r") as handle:
            tab_file = csv.reader(handle, delimiter="\t")
            dico_network = {}
            GeneID_index = 1
            network_cols = [1, 2, 7, 8, 11, 12, 14, 18, 20]
            for line in tab_file:
                if line[GeneID_index] not in dico_network:
                    dico_network[line[GeneID_index]] = [[line[i] for i in network_cols]]
                else:
                    dico_network[line[GeneID_index]].append([line[i] for i in network_cols])

        #delete the zip archive and the tmp_BioGRID directory
        os.remove("BioGRID.zip")
        shutil.rmtree("tmp_BioGRID", ignore_errors=True)

        #download NCBI2Reactome.txt file and build dictionary
        with requests.Session() as s:
            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
            r.encoding = "utf-8"
            tab_file = csv.reader(r.content.splitlines(), delimiter='\t')

        dico_nodes = {}
        geneid_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[geneid_index] in dico_nodes:
                    dico_nodes[line[geneid_index]].append(line[pathway_description_index])
                else:
                    dico_nodes[line[geneid_index]] = [line[pathway_description_index]]

        dico = {}
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes

    ##BioPlex
    elif interactome == "bioplex":

        with requests.Session() as s:
            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv')
            r = r.content.decode('utf-8')
            bioplex = csv.reader(r.splitlines(), delimiter='\t')

        dico_network = {}
        dico_network["GeneID"] = {}
        network_geneid_cols = [0, 1, 4, 5, 8]
        dico_network["UniProt-AC"] = {}
        network_uniprot_cols = [2, 3, 4, 5, 8]
        dico_GeneID_to_UniProt = {}
        for line in bioplex:
            if line[0] not in dico_network["GeneID"]:
                dico_network["GeneID"][line[0]] = [[line[i] for i in network_geneid_cols]]
            else:
                dico_network["GeneID"][line[0]].append([line[i] for i in network_geneid_cols])
            if line[2] not in dico_network["UniProt-AC"]:  # was line[1]: test the same UniProt-AC key that is inserted below
                dico_network["UniProt-AC"][line[2]] = [[line[i] for i in network_uniprot_cols]]
            else:
                dico_network["UniProt-AC"][line[2]].append([line[i] for i in network_uniprot_cols])
            dico_GeneID_to_UniProt[line[0]] = line[2]

        with requests.Session() as s:
            r = s.get('https://reactome.org/download/current/UniProt2Reactome.txt')
            r.encoding = "utf-8"
            tab_file = csv.reader(r.content.splitlines(), delimiter='\t')

        dico_nodes_uniprot = {}
        uniProt_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[uniProt_index] in dico_nodes_uniprot:
                    dico_nodes_uniprot[line[uniProt_index]].append(line[pathway_description_index])
                else:
                    dico_nodes_uniprot[line[uniProt_index]] = [line[pathway_description_index]]

        with requests.Session() as s:
            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
            r.encoding = "utf-8"
            tab_file = csv.reader(r.content.splitlines(), delimiter='\t')

        dico_nodes_geneid = {}
        geneid_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[geneid_index] in dico_nodes_geneid:
                    dico_nodes_geneid[line[geneid_index]].append(line[pathway_description_index])
                else:
                    dico_nodes_geneid[line[geneid_index]] = [line[pathway_description_index]]

        dico = {}
        dico_nodes = {}
        dico_nodes['GeneID'] = dico_nodes_geneid
        dico_nodes['UniProt-AC'] = dico_nodes_uniprot
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes
        dico['convert'] = dico_GeneID_to_UniProt

    ##Humap
    elif interactome == "humap":

        with requests.Session() as s:
            r = s.get('http://proteincomplexes.org/static/downloads/nodeTable.txt')
            r = r.content.decode('utf-8')
            humap_nodes = csv.reader(r.splitlines(), delimiter=',')

        dico_geneid_to_gene_name = {}
        dico_protein_name = {}
        for line in humap_nodes:
            if check_entrez_geneid(line[4]):
                if line[4] not in dico_geneid_to_gene_name:
                    dico_geneid_to_gene_name[line[4]] = line[3]
                if line[4] not in dico_protein_name:
                    dico_protein_name[line[4]] = line[5]

        with requests.Session() as s:
            r = s.get('http://proteincomplexes.org/static/downloads/pairsWprob.txt')
            r = r.content.decode('utf-8')
            humap = csv.reader(r.splitlines(), delimiter='\t')

        dico_network = {}
        for line in humap:
            if check_entrez_geneid(line[0]) and check_entrez_geneid(line[1]):

                interactant_A, interactant_B = get_interactant_name(line, dico_geneid_to_gene_name)

                #first interactant (first column)
                if line[0] not in dico_network:
                    dico_network[line[0]] = [line[:2] + [interactant_A, interactant_B, line[2]]]
                else:
                    dico_network[line[0]].append(line[:2] + [interactant_A, interactant_B, line[2]])

                #second interactant (second column)
                if line[1] not in dico_network:
                    dico_network[line[1]] = [[line[1], line[0], interactant_B, interactant_A, line[2]]]
                else:
                    dico_network[line[1]].append([line[1], line[0], interactant_B, interactant_A, line[2]])

        with requests.Session() as s:
            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
            r.encoding = "utf-8"
            tab_file = csv.reader(r.content.splitlines(), delimiter='\t')

        dico_nodes = {}
        geneid_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                #fill dictionary with pathways
                if line[geneid_index] in dico_nodes:
                    dico_nodes[line[geneid_index]].append(line[pathway_description_index])
                else:
                    dico_nodes[line[geneid_index]] = [line[pathway_description_index]]

        dico = {}
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes
        dico['gene_name'] = dico_geneid_to_gene_name
        dico['protein_name'] = dico_protein_name

    #write output
    output_file = species + '_' + interactome + '_' + time.strftime("%d-%m-%Y") + ".json"
    path = os.path.join(target_directory, output_file)
    name = species + " (" + species_dict[species] + ") " + time.strftime("%d/%m/%Y")
    id = species + "_" + interactome + "_" + time.strftime("%d-%m-%Y")

    with open(path, 'w') as handle:
        json.dump(dico, handle, sort_keys=True)

    data_table_entry = dict(id=id, name=name, species=species, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_" + interactome + "_dictionaries")

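# Illustration of the JSON layout written above (keys depend on the interactome):
#   biogrid: {"network": {geneid: [rows]}, "nodes": {geneid: [pathways]}}
#   bioplex: same, but "network"/"nodes" are keyed by both "GeneID" and "UniProt-AC",
#            plus a "convert" GeneID -> UniProt-AC table
#   humap:   adds "gene_name" and "protein_name" lookup tables
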
#######################################################################################################
# 5. neXtProt (add protein features)
#######################################################################################################

def Build_nextprot_ref_file(data_manager_dict, target_directory):
    nextprot_ids_file = "nextprot_ac_list_all.txt"
    ids = id_list_from_nextprot_ftp(nextprot_ids_file, target_directory)

    nextprot_file = [["NextprotID", "MW", "SeqLength", "IsoPoint", "Chr", "SubcellLocations", "Diseases", "TMDomains", "ProteinExistence"]]
    for id in ids:
        query = "https://api.nextprot.org/entry/" + id + ".json"
        resp = requests.get(url=query)
        data = resp.json()

        #get info from the json dictionary
        mass_mol = data["entry"]["isoforms"][0]["massAsString"]
        seq_length = data['entry']["isoforms"][0]["sequenceLength"]
        iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
        chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
        protein_existence = "PE" + str(data['entry']["overview"]['proteinExistence']['level'])

        #put all subcellular locations in a set
        if "subcellular-location" in data['entry']["annotationsByCategory"].keys():
            subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
            all_subcell_locs = set()
            for loc in subcell_locs:
                all_subcell_locs.add(loc['cvTermName'])
            all_subcell_locs.discard("")
            all_subcell_locs = ";".join(all_subcell_locs)
        else:
            all_subcell_locs = "NA"

        #put all diseases in a set
        if ('disease') in data['entry']['annotationsByCategory'].keys():
            diseases = data['entry']['annotationsByCategory']['disease']
            all_diseases = set()
            for disease in diseases:
                if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
                    all_diseases.add(disease['cvTermName'])
            if len(all_diseases) > 0:
                all_diseases = ";".join(all_diseases)
            else:
                all_diseases = "NA"
        else:
            all_diseases = "NA"

        #count transmembrane (helical) domains
        nb_domains = 0
        if "domain" in data['entry']['annotationsByCategory'].keys():
            tm_domains = data['entry']['annotationsByCategory']["domain"]
            for tm_domain in tm_domains:
                if "properties" in tm_domain.keys() and tm_domain['properties'] != []:
                    domains = tm_domain["properties"]  # was tm_domains["properties"]: index the current annotation, not the list
                    for domain in domains:
                        if domain["name"] == "region structure" and domain["value"] == "Helical":
                            nb_domains += 1

        nextprot_file.append([id, mass_mol, str(seq_length), iso_elec_point, chr_loc, all_subcell_locs, all_diseases, str(nb_domains), protein_existence])

    output_file = 'nextprot_ref_' + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)
    name = "neXtProt release " + time.strftime("%d-%m-%Y")
    id = "nextprot_ref_" + time.strftime("%d-%m-%Y")

    with open(path, 'w') as output:
        writer = csv.writer(output, delimiter="\t")
        writer.writerows(nextprot_file)

    data_table_entry = dict(id=id, name=name, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")

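# Minimal sketch of the same API call for a single accession (assumes network access):
#   resp = requests.get("https://api.nextprot.org/entry/NX_P31946.json")
#   print(resp.json()["entry"]["isoforms"][0]["massAsString"])
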
#######################################################################################################
# Main function
#######################################################################################################
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar=("HPA_OPTION"))
    parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID"))
    parser.add_argument("--id_mapping", metavar=("ID_MAPPING_SPECIES"))
    parser.add_argument("--interactome", metavar=("PPI"))
    parser.add_argument("--species")
    parser.add_argument("--date")
    parser.add_argument("-o", "--output")
    parser.add_argument("--database")
    args = parser.parse_args()

    data_manager_dict = {}
    # Extract json file params
    filename = args.output
    params = from_json_string(open(filename).read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    ## Download source files from HPA
    try:
        hpa = args.hpa
    except NameError:
        hpa = None
    if hpa is not None:
        hpa = hpa.split(",")
        for hpa_tissue in hpa:
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    ## Download source file from a Peptide Atlas query
    try:
        peptide_atlas = args.peptideatlas
        date = args.date
    except NameError:
        peptide_atlas = None
    if peptide_atlas is not None:
        peptide_atlas = peptide_atlas.split(",")
        for pa_tissue in peptide_atlas:
            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)

    ## Download ID mapping source files from UniProt
    try:
        id_mapping = args.id_mapping
    except NameError:
        id_mapping = None
    if id_mapping is not None:
        id_mapping = id_mapping.split(",")
        for species in id_mapping:
            id_mapping_sources(data_manager_dict, species, target_directory)

    ## Download PPI ref files from BioGRID/BioPlex/Hu.MAP
    try:
        interactome = args.interactome
        if interactome == "biogrid":
            species = args.species
        else:
            species = "Human"
    except NameError:
        interactome = None
        species = None
    if interactome is not None and species is not None:
        PPI_ref_files(data_manager_dict, species, interactome, target_directory)

    ## Build the neXtProt reference file for the "add protein features" tool
    try:
        database = args.database
    except NameError:
        database = None
    if database is not None:
        Build_nextprot_ref_file(data_manager_dict, target_directory)

    #save info to the json file
    filename = args.output
    open(filename, 'w').write(to_json_string(data_manager_dict))  # text mode: to_json_string returns a str

if __name__ == "__main__":
    main()
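
# Illustrative local invocation (hypothetical paths; Galaxy normally generates the
# JSON parameter file, and extra_files_path must not already exist):
#   echo '{"output_data": [{"extra_files_path": "/tmp/proteore_refs"}]}' > params.json
#   python resource_building.py --hpa HPA_normal_tissue -o params.json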