data_manager/resource_building.py @ 0:9e31ea9fc7ea (draft)
planemo upload commit 567ba7934c0ca55529dfeb5e7ca0935ace260ad7-dirty
| author | proteore |
|---|---|
| date | Wed, 13 Mar 2019 06:30:42 -0400 |
| parents | |
| children | f3507260b30f |
compared revisions: -1:000000000000 (parent) → 0:9e31ea9fc7ea (new file)
# -*- coding: utf-8 -*-
"""
The purpose of this script is to create source files from different databases to be used in other ProteoRE tools.
"""

import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile, ftplib, gzip
from io import BytesIO
from zipfile import ZipFile
from galaxy.util.json import from_json_string, to_json_string

#######################################################################################################
# General functions
#######################################################################################################
def unzip(url, output_file):
    """
    Download a zip archive from a URL and extract its first member to output_file.
    """
    content = requests.get(url)
    archive = ZipFile(BytesIO(content.content))  # do not shadow the zipfile module imported above
    output_content = archive.open(archive.namelist()[0]).read()  # bytes under Python 3
    with open(output_file, "wb") as output:  # write bytes; text mode would fail on Python 3
        output.write(output_content)

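# Usage sketch (illustrative only; the URL is one of the real HPA endpoints used in
# HPA_sources() below, the output filename is arbitrary):
#
#   unzip("https://www.proteinatlas.org/download/normal_tissue.tsv.zip",
#         "normal_tissue.tsv")
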
def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
    data_manager_dict['data_tables'][data_table].append(data_table_entry)
    return data_manager_dict

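# For reference, once one HPA entry has been appended, the dictionary serialized at
# the end of main() has this shape (sketch, example date, values abridged):
#
#   {"data_tables": {"proteore_protein_atlas_normal_tissue": [
#       {"id": "HPA_normal_tissue_13-03-2019", "name": "HPA normal tissue 13/03/2019",
#        "tissue": "HPA_normal_tissue", "value": "/path/to/HPA_normal_tissue_13-03-2019.tsv"}]}}
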
#######################################################################################################
# 1. Human Protein Atlas
#    - Normal tissue
#    - Pathology
#    - Full Atlas
#######################################################################################################
def HPA_sources(data_manager_dict, tissue, target_directory):
    if tissue == "HPA_normal_tissue":
        tissue_name = "HPA normal tissue"
        url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
        table = "proteore_protein_atlas_normal_tissue"
    elif tissue == "HPA_pathology":
        tissue_name = "HPA pathology"
        url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
        table = "proteore_protein_atlas_tumor_tissue"
    elif tissue == "HPA_full_atlas":
        tissue_name = "HPA full atlas"
        url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
        table = "proteore_protein_full_atlas"

    output_file = tissue + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)  # download and save file
    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
    tissue_id = tissue_name.replace(" ", "_").replace("/", "-")

    data_table_entry = dict(id=tissue_id, name=tissue_name, tissue=tissue, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, table)

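# Example of the derived names for tissue="HPA_normal_tissue" on 13/03/2019:
#   output_file -> "HPA_normal_tissue_13-03-2019.tsv"
#   tissue_name -> "HPA normal tissue 13/03/2019"
#   tissue_id   -> "HPA_normal_tissue_13-03-2019"
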
#######################################################################################################
# 2. Peptide Atlas
#######################################################################################################
def peptide_atlas_sources(data_manager_dict, tissue, date, target_directory):
    # Define organism_id (here Human) - to be upgraded when other organisms are added to the project
    organism_id = "2"
    # The tissue argument is "<sample_category_id>.<tissue_name>"; split it and build the output filename
    tissue = tissue.split(".")
    sample_category_id = tissue[0]
    tissue_name = tissue[1]
    output_file = tissue_name + "_" + date + ".tsv"

    query = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id=" + \
        sample_category_id + "&display_options=ShowAbundances&organism_id=" + organism_id + \
        "&redundancy_constraint=4&presence_level_constraint=1%2C2&gene_annotation_level_constraint=leaf" + \
        "&QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY"

    with requests.Session() as s:
        download = s.get(query)
        decoded_content = download.content.decode('utf-8')
        cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

    uni_dict = build_dictionary(cr)

    # columns of the peptide_atlas data table
    tissue_id = tissue_name + "_" + date
    name = tissue_id.replace("-", "/").replace("_", " ")
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(id=tissue_id, name=name, value=path, tissue=tissue_name)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_peptide_atlas")

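# The assembled PeptideAtlas query looks like (wrapped here for readability):
#   https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id=<sample_category_id>
#   &display_options=ShowAbundances&organism_id=2&redundancy_constraint=4
#   &presence_level_constraint=1%2C2&gene_annotation_level_constraint=leaf
#   &QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY
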
#function to count the number of observations by UniProt accession
def build_dictionary(reader):
    uni_dict = {}
    for line in reader:
        # skip isoforms ("-" in the accession) and anything that is not a UniProt AC (e.g. the header)
        if "-" not in line[0] and check_uniprot_access(line[0]):
            if line[0] in uni_dict:
                uni_dict[line[0]] += int(line[5])
            else:
                uni_dict[line[0]] = int(line[5])

    return uni_dict
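
# build_dictionary() sums the observation count (column 6 of the PeptideAtlas TSV)
# per accession, e.g. rows [["P31946", ..., "12"], ["P31946", ..., "3"]] yield
# {"P31946": 15} (illustrative values).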

#function to check whether an id is a UniProt accession number: returns True or False
def check_uniprot_access(identifier):
    uniprot_pattern = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
    return bool(uniprot_pattern.match(identifier))

def check_entrez_geneid(identifier):
    entrez_pattern = re.compile("[0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+")
    return bool(entrez_pattern.match(identifier))

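# Illustrative behaviour of the two checks (patterns are matched from the start of the string):
#   check_uniprot_access("P31946")    -> True   (UniProt accession)
#   check_uniprot_access("NX_P31946") -> False  (neXtProt prefix does not match)
#   check_entrez_geneid("7529")       -> True   (numeric Entrez GeneID)
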
#######################################################################################################
# 3. ID mapping file
#######################################################################################################
csv.field_size_limit(sys.maxsize)  # raise the csv field size limit to handle big files

def id_mapping_sources(data_manager_dict, species, target_directory):

    human = species == "Human"
    species_dict = {"Human": "HUMAN_9606", "Mouse": "MOUSE_10090", "Rat": "RAT_10116"}
    files = ["idmapping_selected.tab.gz", "idmapping.dat.gz"]

    #header
    if human:
        tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
    else:
        tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]

    #print("header ok")

    #get selected.tab and keep only the columns of interest
    selected_tab_file = species_dict[species] + "_" + files[0]
    tab_path = download_from_uniprot_ftp(selected_tab_file, target_directory)
    with gzip.open(tab_path, "rt") as select:
        tab_reader = csv.reader(select, delimiter="\t")
        for line in tab_reader:
            tab.append([line[i] for i in [0, 1, 2, 3, 4, 5, 6, 11, 13, 14, 18, 19, 20]])
    os.remove(tab_path)

    #print("selected_tab ok")

    # Supplementary IDs to get from <species>_idmapping.dat:
    #   neXtProt (human only), BioGrid, STRING, KEGG

    #there are more ID types for human
    if human:
        ids = ['neXtProt', 'BioGrid', 'STRING', 'KEGG']  # ids to get from the dat file
    else:
        ids = ['BioGrid', 'STRING', 'KEGG']
    unidict = {}

    #keep only ids of interest in dictionaries
    dat_file = species_dict[species] + "_" + files[1]
    dat_path = download_from_uniprot_ftp(dat_file, target_directory)
    with gzip.open(dat_path, "rt") as dat:
        dat_reader = csv.reader(dat, delimiter="\t")
        for line in dat_reader:
            uniprotID = line[0]  # UniProt AC as key
            id_type = line[1]    # ID type of the corresponding id, key of the sub-dictionary
            cor_id = line[2]     # corresponding id
            if "-" not in id_type:  # we don't keep isoforms
                if id_type in ids and uniprotID in unidict:
                    if id_type in unidict[uniprotID]:
                        #there is already a value for this ID type: append to it
                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type], cor_id])
                    else:
                        unidict[uniprotID].update({id_type: cor_id})
                elif id_type in ids:
                    unidict[uniprotID] = {id_type: cor_id}
    os.remove(dat_path)

    #print("dat_file ok")

    #add ids from idmapping.dat to the final tab
    for line in tab[1:]:
        uniprotID = line[0]
        if human:
            if uniprotID in unidict:
                nextprot = access_dictionary(unidict, uniprotID, 'neXtProt')
                if nextprot != '':
                    nextprot = clean_nextprot_id(nextprot, line[0])
                line.extend([nextprot, access_dictionary(unidict, uniprotID, 'BioGrid'),
                             access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", "", ""])
        else:
            if uniprotID in unidict:
                line.extend([access_dictionary(unidict, uniprotID, 'BioGrid'),
                             access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", ""])

    #print("tab ok")

    #add missing neXtProt IDs for human
    if human:
        #build next_dict
        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt", target_directory)
        next_dict = {}
        for nextid in nextprot_ids:
            next_dict[nextid.replace("NX_", "")] = nextid
        os.remove(os.path.join(target_directory, "nextprot_ac_list_all.txt"))

        #add the missing neXtProt ID (column 13) when the accession is known to neXtProt
        for line in tab[1:]:
            uniprotID = line[0]
            nextprotID = line[13]
            if nextprotID == '' and uniprotID in next_dict:
                line[13] = next_dict[uniprotID]

    output_file = species + "_id_mapping_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerows(tab)

    name_dict = {"Human": "Homo sapiens", "Mouse": "Mus musculus", "Rat": "Rattus norvegicus"}
    name = species + " (" + name_dict[species] + " " + time.strftime("%d/%m/%Y") + ")"
    id = species + "_id_mapping_" + time.strftime("%d-%m-%Y")

    data_table_entry = dict(id=id, name=name, species=species, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_" + species)

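# unidict maps each UniProt AC to the supplementary IDs collected from idmapping.dat,
# e.g. (sketch; the values shown here are illustrative, not verified entries):
#   unidict["P31946"] = {"neXtProt": "NX_P31946", "BioGrid": "113361",
#                        "STRING": "9606.ENSP00000300161", "KEGG": "hsa:7529"}
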
def download_from_uniprot_ftp(filename, target_directory):
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, filename)
    ftp = ftplib.FTP("ftp.uniprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    with open(path, 'wb') as handle:  # close the file handle once the download is done
        ftp.retrbinary("RETR " + filename, handle.write)
    ftp.quit()
    return path

def id_list_from_nextprot_ftp(filename, target_directory):
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, filename)
    ftp = ftplib.FTP("ftp.nextprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    with open(path, 'wb') as handle:
        ftp.retrbinary("RETR " + filename, handle.write)
    ftp.quit()
    with open(path, 'r') as nextprot_ids:
        nextprot_ids = nextprot_ids.read().splitlines()
    return nextprot_ids

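# The downloads above resolve to URLs such as:
#   ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz
#   ftp://ftp.nextprot.org/pub/current_release/ac_lists/nextprot_ac_list_all.txt
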
#return '' if there's no value in a dictionary, avoids KeyError
def access_dictionary(dico, key1, key2):
    if key1 in dico:
        if key2 in dico[key1]:
            return dico[key1][key2]
        else:
            return ''
            #print (key2,"not in ",dico,"[",key1,"]")
    else:
        return ''

#if there are several neXtProt IDs for one UniProt AC, prefer the UniProt-like ID
def clean_nextprot_id(next_id, uniprotAc):
    if len(next_id.split(";")) > 1:
        tmp = next_id.split(";")
        if "NX_" + uniprotAc in tmp:
            return "NX_" + uniprotAc
        else:
            return tmp[1]
    else:
        return next_id

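# Example: clean_nextprot_id("NX_A0A183;NX_P31946", "P31946") returns "NX_P31946";
# a single-valued next_id is returned unchanged.
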

#######################################################################################################
# 4. Build protein interaction maps files
#######################################################################################################

def get_interactant_name(line, dico):

    if line[0] in dico:
        interactant_A = dico[line[0]]
    else:
        interactant_A = "NA"

    if line[1] in dico:
        interactant_B = dico[line[1]]
    else:
        interactant_B = "NA"

    return interactant_A, interactant_B

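# Example (illustrative): get_interactant_name(["7529", "7533", "0.9"], {"7529": "YWHAB"})
# returns ("YWHAB", "NA"), since only the first GeneID is in the dictionary.
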
def PPI_ref_files(data_manager_dict, species, interactome, target_directory):

    species_dict = {'Human': 'Homo sapiens', "Mouse": "Mus musculus", "Rat": "Rattus norvegicus"}

    ##BioGRID
    if interactome == "biogrid":

        tab2_link = "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip"

        #download zip file
        r = requests.get(tab2_link)
        with open("BioGRID.zip", "wb") as code:
            code.write(r.content)

        #unzip files
        with zipfile.ZipFile("BioGRID.zip", 'r') as zip_ref:
            if not os.path.exists("tmp_BioGRID"):
                os.makedirs("tmp_BioGRID")
            zip_ref.extractall("tmp_BioGRID")

        #import the file of interest and build the network dictionary (keyed by GeneID of interactant A)
        file_path = "tmp_BioGRID/BIOGRID-ORGANISM-" + species_dict[species].replace(" ", "_") + "-3.5.167.tab2.txt"
        with open(file_path, "r") as handle:
            tab_file = csv.reader(handle, delimiter="\t")
            dico_network = {}
            GeneID_index = 1
            network_cols = [1, 2, 7, 8, 11, 12, 14, 18, 20]
            for line in tab_file:
                if line[GeneID_index] not in dico_network:
                    dico_network[line[GeneID_index]] = [[line[i] for i in network_cols]]
                else:
                    dico_network[line[GeneID_index]].append([line[i] for i in network_cols])

        #delete the archive and the tmp_BioGRID directory
        os.remove("BioGRID.zip")
        shutil.rmtree("tmp_BioGRID", ignore_errors=True)

        #download the NCBI2Reactome.txt file and build the nodes dictionary (GeneID -> pathways)
        with requests.Session() as s:
            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
            r.encoding = "utf-8"
            tab_file = csv.reader(r.text.splitlines(), delimiter='\t')  # r.text honours the encoding set above

        dico_nodes = {}
        geneid_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[geneid_index] in dico_nodes:
                    dico_nodes[line[geneid_index]].append(line[pathway_description_index])
                else:
                    dico_nodes[line[geneid_index]] = [line[pathway_description_index]]

        dico = {}
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes

    ##BioPlex
    elif interactome == "bioplex":

        with requests.Session() as s:
            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv')
            r = r.content.decode('utf-8')
            bioplex = csv.reader(r.splitlines(), delimiter='\t')

        dico_network = {}
        dico_network["GeneID"] = {}
        network_geneid_cols = [0, 1, 4, 5, 8]
        dico_network["UniProt-AC"] = {}
        network_uniprot_cols = [2, 3, 4, 5, 8]
        dico_GeneID_to_UniProt = {}
        for line in bioplex:
            if line[0] not in dico_network["GeneID"]:
                dico_network["GeneID"][line[0]] = [[line[i] for i in network_geneid_cols]]
            else:
                dico_network["GeneID"][line[0]].append([line[i] for i in network_geneid_cols])
            if line[2] not in dico_network["UniProt-AC"]:  # line[2] is the UniProt-AC of interactant A, the key used below
                dico_network["UniProt-AC"][line[2]] = [[line[i] for i in network_uniprot_cols]]
            else:
                dico_network["UniProt-AC"][line[2]].append([line[i] for i in network_uniprot_cols])
            dico_GeneID_to_UniProt[line[0]] = line[2]

        with requests.Session() as s:
            r = s.get('https://reactome.org/download/current/UniProt2Reactome.txt')
            r.encoding = "utf-8"
            tab_file = csv.reader(r.text.splitlines(), delimiter='\t')

        dico_nodes_uniprot = {}
        uniProt_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[uniProt_index] in dico_nodes_uniprot:
                    dico_nodes_uniprot[line[uniProt_index]].append(line[pathway_description_index])
                else:
                    dico_nodes_uniprot[line[uniProt_index]] = [line[pathway_description_index]]

        with requests.Session() as s:
            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
            r.encoding = "utf-8"
            tab_file = csv.reader(r.text.splitlines(), delimiter='\t')

        dico_nodes_geneid = {}
        geneid_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[geneid_index] in dico_nodes_geneid:
                    dico_nodes_geneid[line[geneid_index]].append(line[pathway_description_index])
                else:
                    dico_nodes_geneid[line[geneid_index]] = [line[pathway_description_index]]

        dico = {}
        dico_nodes = {}
        dico_nodes['GeneID'] = dico_nodes_geneid
        dico_nodes['UniProt-AC'] = dico_nodes_uniprot
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes
        dico['convert'] = dico_GeneID_to_UniProt

    ##Hu.MAP
    elif interactome == "humap":

        with requests.Session() as s:
            r = s.get('http://proteincomplexes.org/static/downloads/nodeTable.txt')
            r = r.content.decode('utf-8')
            humap_nodes = csv.reader(r.splitlines(), delimiter=',')

        dico_geneid_to_gene_name = {}
        dico_protein_name = {}
        for line in humap_nodes:
            if check_entrez_geneid(line[4]):
                if line[4] not in dico_geneid_to_gene_name:
                    dico_geneid_to_gene_name[line[4]] = line[3]
                if line[4] not in dico_protein_name:
                    dico_protein_name[line[4]] = line[5]

        with requests.Session() as s:
            r = s.get('http://proteincomplexes.org/static/downloads/pairsWprob.txt')
            r = r.content.decode('utf-8')
            humap = csv.reader(r.splitlines(), delimiter='\t')

        dico_network = {}
        for line in humap:
            if check_entrez_geneid(line[0]) and check_entrez_geneid(line[1]):

                interactant_A, interactant_B = get_interactant_name(line, dico_geneid_to_gene_name)

                #first interactant (first column)
                if line[0] not in dico_network:
                    dico_network[line[0]] = [line[:2] + [interactant_A, interactant_B, line[2]]]
                else:
                    dico_network[line[0]].append(line[:2] + [interactant_A, interactant_B, line[2]])

                #second interactant (second column)
                if line[1] not in dico_network:
                    dico_network[line[1]] = [[line[1], line[0], interactant_B, interactant_A, line[2]]]
                else:
                    dico_network[line[1]].append([line[1], line[0], interactant_B, interactant_A, line[2]])

        with requests.Session() as s:
            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
            r.encoding = "utf-8"
            tab_file = csv.reader(r.text.splitlines(), delimiter='\t')

        dico_nodes = {}
        geneid_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                #fill the dictionary with pathways
                if line[geneid_index] in dico_nodes:
                    dico_nodes[line[geneid_index]].append(line[pathway_description_index])
                else:
                    dico_nodes[line[geneid_index]] = [line[pathway_description_index]]

        dico = {}
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes
        dico['gene_name'] = dico_geneid_to_gene_name
        dico['protein_name'] = dico_protein_name

    #writing output
    output_file = species + '_' + interactome + '_' + time.strftime("%d-%m-%Y") + ".json"
    path = os.path.join(target_directory, output_file)
    name = species + " (" + species_dict[species] + ") " + time.strftime("%d/%m/%Y")
    id = species + "_" + interactome + "_" + time.strftime("%d-%m-%Y")

    with open(path, 'w') as handle:
        json.dump(dico, handle, sort_keys=True)

    data_table_entry = dict(id=id, name=name, species=species, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_" + interactome + "_dictionaries")

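# Whatever the interactome, the JSON written above carries "network" and "nodes";
# BioPlex adds "convert" (GeneID -> UniProt-AC) and Hu.MAP adds "gene_name" and
# "protein_name". Sketch of the BioPlex case (illustrative key/value):
#   {"network": {...}, "nodes": {...}, "convert": {"7529": "P31946", ...}}
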
#######################################################################################################
# 5. neXtProt (add protein features)
#######################################################################################################

def Build_nextprot_ref_file(data_manager_dict, target_directory):
    nextprot_ids_file = "nextprot_ac_list_all.txt"
    ids = id_list_from_nextprot_ftp(nextprot_ids_file, target_directory)

    nextprot_file = [["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
    for id in ids:
        #print (id)
        query = "https://api.nextprot.org/entry/" + id + ".json"
        resp = requests.get(url=query)
        data = resp.json()

        #get info from the json dictionary
        mass_mol = data["entry"]["isoforms"][0]["massAsString"]
        seq_length = data['entry']["isoforms"][0]["sequenceLength"]
        iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
        chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
        protein_existence = "PE" + str(data['entry']["overview"]['proteinExistence']['level'])

        #put all subcellular locations in a set
        if "subcellular-location" in data['entry']["annotationsByCategory"].keys():
            subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
            all_subcell_locs = set()
            for loc in subcell_locs:
                all_subcell_locs.add(loc['cvTermName'])
            all_subcell_locs.discard("")
            all_subcell_locs = ";".join(all_subcell_locs)
        else:
            all_subcell_locs = "NA"

        #put all diseases in a set
        if 'disease' in data['entry']['annotationsByCategory'].keys():
            diseases = data['entry']['annotationsByCategory']['disease']
            all_diseases = set()
            for disease in diseases:
                if disease['cvTermName'] is not None and disease['cvTermName'] != "":
                    all_diseases.add(disease['cvTermName'])
            if len(all_diseases) > 0:
                all_diseases = ";".join(all_diseases)
            else:
                all_diseases = "NA"
        else:
            all_diseases = "NA"

        #count transmembrane (helical) domains
        nb_domains = 0
        if "domain" in data['entry']['annotationsByCategory'].keys():
            tm_domains = data['entry']['annotationsByCategory']["domain"]
            for tm_domain in tm_domains:
                if "properties" in tm_domain.keys() and tm_domain['properties'] != []:
                    domains = tm_domain["properties"]  # properties of this domain, not of the whole list
                    for domain in domains:
                        if domain["name"] == "region structure" and domain["value"] == "Helical":
                            nb_domains += 1

        nextprot_file.append([id, mass_mol, str(seq_length), iso_elec_point, chr_loc, all_subcell_locs, all_diseases, str(nb_domains), protein_existence])

    output_file = 'nextprot_ref_' + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)
    name = "neXtProt release " + time.strftime("%d-%m-%Y")
    id = "nextprot_ref_" + time.strftime("%d-%m-%Y")

    with open(path, 'w') as output:
        writer = csv.writer(output, delimiter="\t")
        writer.writerows(nextprot_file)

    data_table_entry = dict(id=id, name=name, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")

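# Each accession is fetched from the neXtProt API, e.g.
#   https://api.nextprot.org/entry/NX_P31946.json
# and contributes one row: [NextprotID, MW, SeqLength, IsoPoint, Chr,
# SubcellLocations, Diseases, TMDomains, ProteinExistence].
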
#######################################################################################################
# Main function
#######################################################################################################
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar=("HPA_OPTION"))
    parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID"))
    parser.add_argument("--id_mapping", metavar=("ID_MAPPING_SPECIES"))
    parser.add_argument("--interactome", metavar=("PPI"))
    parser.add_argument("--species")
    parser.add_argument("--date")
    parser.add_argument("-o", "--output")
    parser.add_argument("--database")
    args = parser.parse_args()

    data_manager_dict = {}

    # Extract json file params (argparse sets unused options to None, so no try/except is needed)
    filename = args.output
    params = from_json_string(open(filename).read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    ## Download source files from HPA
    hpa = args.hpa
    if hpa is not None:
        #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/"
        hpa = hpa.split(",")
        for hpa_tissue in hpa:
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    ## Download source file from PeptideAtlas query
    peptide_atlas = args.peptideatlas
    date = args.date
    if peptide_atlas is not None:
        #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/"
        peptide_atlas = peptide_atlas.split(",")
        for pa_tissue in peptide_atlas:
            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)

    ## Download ID mapping source files from UniProt
    id_mapping = args.id_mapping
    if id_mapping is not None:
        id_mapping = id_mapping.split(",")
        for species in id_mapping:
            id_mapping_sources(data_manager_dict, species, target_directory)

    ## Download PPI reference files from BioGRID/BioPlex/Hu.MAP
    interactome = args.interactome
    if interactome == "biogrid":
        species = args.species
    else:
        species = "Human"
    if interactome is not None and species is not None:
        PPI_ref_files(data_manager_dict, species, interactome, target_directory)

    ## Build the neXtProt reference file for "add protein features"
    database = args.database
    if database is not None:
        Build_nextprot_ref_file(data_manager_dict, target_directory)

    #save info to the json file (text mode: to_json_string returns a string)
    with open(filename, 'w') as out:
        out.write(to_json_string(data_manager_dict))

if __name__ == "__main__":
    main()
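
# Illustrative invocation (in production, Galaxy runs this data manager and supplies
# the JSON params file via -o/--output; the filename here is hypothetical):
#
#   python resource_building.py --hpa HPA_normal_tissue,HPA_pathology -o galaxy_params.json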
