Mercurial > repos > proteore > proteore_build_protein_interaction_maps
diff build_protein_interaction_maps.py @ 0:b0ac71686b99 draft
planemo upload commit 968cd5b4f78f0a1da86fc3bc29f8159f86e199aa-dirty
| author | proteore |
|---|---|
| date | Tue, 12 Mar 2019 05:55:54 -0400 |
parents | |
children | 0a85d709c4ae |
line wrap: on
line diff
# -*- coding: utf-8 -*-
"""Build protein-protein interaction maps (network + nodes files).

Reads a list of protein/gene identifiers — either from a raw text field or
from a column of a tab-separated file — looks them up in a pre-built PPI
dictionary (Hu.MAP, BioPlex or BioGRID, serialized as JSON) and writes two
tab-separated outputs: a network file (one interaction/edge per row) and a
nodes file (one annotated identifier per row).
"""
import argparse
import csv
import json
import re
import sys  # fix: the original called sys.exit() without importing sys

# Python 2/3 compatibility shim: `unicode` only exists on Python 2.
# The original called unicode() unconditionally, which raises NameError
# under Python 3.
try:
    text_type = unicode  # noqa: F821  (Python 2)
except NameError:
    text_type = str  # Python 3


def get_args():
    """Parse and normalize command-line arguments.

    Returns:
        argparse.Namespace with, when --input_type is "file", `ncol`
        converted to a 0-based int and `header` converted to a bool.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--species")
    parser.add_argument("--database", help="Humap, Bioplex or Biogrid", required=True)
    parser.add_argument("--dict_path", required=True)
    parser.add_argument("--input_type", help="type of input (list of id or filename)", required=True)
    parser.add_argument("--input", required=True)
    parser.add_argument("--header")
    parser.add_argument("--ncol")
    parser.add_argument("--id_type")
    parser.add_argument("--network_output")
    parser.add_argument("--nodes_output")
    args = parser.parse_args()

    if args.input_type == "file":
        args.ncol = nb_col_to_int(args.ncol)
        args.header = str2bool(args.header)

    return args


def str2bool(v):
    """Turn a yes/no-style string into a boolean.

    Raises:
        argparse.ArgumentTypeError: if `v` is not a recognized boolean word.
    """
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def nb_col_to_int(nb_col):
    """Convert a column spec like "c2" to a 0-based int index (here 1).

    Exits the program with an error message on malformed input.
    """
    try:
        # fix: narrowed the original bare `except` to the exceptions
        # int()/str.replace() can actually raise here
        return int(nb_col.replace("c", "")) - 1
    except (AttributeError, ValueError):
        sys.exit("Please specify the column where you would like to apply the filter with valid format")


def get_input_ids_from_string(input):
    """Return the list of unique ids found in a raw text field.

    Ids may be separated by semicolons, tabs, newlines or any whitespace.
    """
    ids_list = list(set(re.split(r'\s+', input.replace(";", " ").replace("\r", "").replace("\n", " ").replace("\t", " "))))
    if "" in ids_list:
        ids_list.remove("")
    #if "NA" in ids_list : ids_list.remove("NA")
    return ids_list


def get_input_ids_from_file(input, nb_col, header):
    """Read a tab-separated file and return (expanded file, unique id list).

    Lines holding several ";"-separated ids in column `nb_col` are expanded
    to one id per line (see one_id_one_line).
    """
    with open(input, "r") as csv_file:
        input_file = list(csv.reader(csv_file, delimiter='\t'))

    input_file, ids_list = one_id_one_line(input_file, nb_col, header)
    if "" in ids_list:
        ids_list.remove("")
    #if "NA" in ids_list : ids_list.remove("NA")

    return input_file, ids_list


def one_id_one_line(input_file, nb_col, header):
    """Expand rows with several ids in column `nb_col` into one row per id.

    Args:
        input_file: list of rows (each a list of cells).
        nb_col: 0-based index of the id column.
        header: True if the first row is a header and must be kept as-is.

    Returns:
        (expanded rows, list of unique ids found in the column).
    """
    if header:
        new_file = [input_file[0]]
        input_file = input_file[1:]
    else:
        new_file = []
    ids_list = []

    for line in input_file:
        # skip fully empty lines
        if line != [] and set(line) != {''}:
            line[nb_col] = re.sub(r"\s+", "", line[nb_col])
            if ";" in line[nb_col]:
                ids = line[nb_col].split(";")
                for id in ids:
                    new_file.append(line[:nb_col] + [id] + line[nb_col + 1:])
                    ids_list.append(id)
            else:
                new_file.append(line)
                ids_list.append(line[nb_col])

    ids_list = list(set(ids_list))

    return new_file, ids_list


def blank_to_NA(csv_file):
    """Replace blank-ish cells ("", " ", "NaN", "-") with "NA" in every row."""
    tmp = []
    for line in csv_file:
        line = ["NA" if cell == "" or cell == " " or cell == "NaN" or cell == "-" else cell for cell in line]
        tmp.append(line)

    return tmp


def biogrid_output_files(ids, species):
    """Build BioGRID network and nodes tables for the given input ids.

    Uses the global `ppi_dict` ({'network': {...}, 'nodes': {...}}).
    Returns (network_file, nodes_file), each a list of rows with a header.
    """
    network_file = [["Entrez Gene Interactor A", "Entrez Gene Interactor B", "Gene symbol Interactor A", "Gene symbol Interactor B", "Experimental System", "Experimental Type", "Pubmed ID", "Interaction Score", "Phenotypes"]]
    ids_set = set(ids)
    ids_not_found = set([])
    for id in ids:
        if id in ppi_dict['network']:
            network_file.extend(ppi_dict['network'][id])
            # also annotate every interactor pulled in through an edge
            ids_set.update([interact[1] for interact in ppi_dict['network'][id]])
        else:
            ids_not_found.add(id)

    nodes_file = [["Entrez gene ID", "Official Symbol Interactor", "Present in user input ids", "ID present in Biogrid " + species, "Pathway"]]
    for id in ids_set:
        # get pathway
        if id in ppi_dict['nodes']:
            description_pathway = ";".join(ppi_dict['nodes'][id])
        else:
            description_pathway = "NA"

        # get gene name (symbol of interactor A on the id's first edge)
        if id in ppi_dict['network']:
            gene_name = ppi_dict['network'][id][0][2]
        else:
            gene_name = "NA"

        # make line
        nodes_file.append([id] + [gene_name] + [id in ids] + [id not in ids_not_found] + [description_pathway])

    return network_file, nodes_file


def bioplex_output_files(ids, id_type, species):
    """Build BioPlex network and nodes tables for the given input ids.

    `id_type` is either "UniProt-AC" or "GeneID" and selects the sub-dict
    of the global `ppi_dict` as well as the nodes-file layout.
    Returns (network_file, nodes_file).
    """
    network_file = [[id_type + " Interactor A", id_type + " Interactor B", "Gene symbol Interactor A", "Gene symbol Interactor B", "Interaction Score"]]
    ids_set = set(ids)
    ids_not_found = set([])
    for id in ids:
        if id in ppi_dict['network'][id_type]:
            network_file.extend(ppi_dict['network'][id_type][id])
            ids_set.update([interact[1] for interact in ppi_dict['network'][id_type][id]])
        else:
            ids_not_found.add(id)

    if id_type == "UniProt-AC":
        nodes_file = [[id_type, "Present in user input ids", "ID present in Human Bioplex", "Pathway"]]
    else:
        nodes_file = [[id_type, "Official symbol Interactor", "Present in user input ids", "Present in interactome", "Pathway"]]
    for id in ids_set:

        if id in ppi_dict['nodes'][id_type]:
            description_pathway = ";".join(ppi_dict['nodes'][id_type][id])
        else:
            description_pathway = "NA"

        # make line (UniProt-AC rows have no gene-name column)
        if id_type == "UniProt-AC":
            nodes_file.append([id] + [id in ids] + [id not in ids_not_found] + [description_pathway])
        elif id_type == "GeneID":
            # get gene_name
            if id in ppi_dict['network'][id_type]:
                gene_name = ppi_dict['network'][id_type][id][0][2]
            else:
                gene_name = "NA"
            nodes_file.append([id] + [gene_name] + [id in ids] + [id not in ids_not_found] + [description_pathway])

    return network_file, nodes_file


def humap_output_files(ids, species):
    """Build Hu.MAP network and nodes tables for the given input ids.

    Uses the global `ppi_dict` ({'network', 'nodes', 'gene_name'}).
    Returns (network_file, nodes_file).
    """
    network_file = [["Entrez Gene Interactor A", "Entrez Gene Interactor B", "Gene symbol Interactor A", "Gene symbol Interactor B", "Interaction Score"]]
    ids_set = set(ids)
    ids_not_found = set([])
    for id in ids:
        if id in ppi_dict['network']:
            network_file.extend(ppi_dict['network'][id])
            ids_set.update([interact[1] for interact in ppi_dict['network'][id]])
        else:
            ids_not_found.add(id)

    nodes_file = [["Entrez gene ID", "Official Symbol Interactor", "Present in user input ids", "ID present in Hu.MAP", "Pathway"]]
    for id in ids_set:
        if id in ppi_dict['nodes']:
            description_pathway = ";".join(ppi_dict['nodes'][id])
        else:
            description_pathway = "NA"

        # get gene name (Hu.MAP keeps a dedicated id -> symbol mapping)
        if id in ppi_dict['gene_name']:
            gene_name = ppi_dict['gene_name'][id]
        else:
            gene_name = "NA"

        # make line
        nodes_file.append([id] + [gene_name] + [id in ids] + [id not in ids_not_found] + [description_pathway])

    return network_file, nodes_file


def sort_by_column(tab, sort_col, reverse, header):
    """Sort rows of `tab` by the value in column `sort_col`.

    Numeric columns are sorted numerically (float wins over int when any
    value has a decimal part), others lexicographically. Rows whose sort
    cell is empty or "NA" are moved, unsorted, to the end. The header row
    (if `header`) is kept in place.
    """
    if len(tab) > 1:  # if there's more than just a header or 1 row
        if header:
            head = tab[0]
            tab = tab[1:]

        # list of empty cells in the column to sort
        unsortable_lines = [i for i, line in enumerate(tab) if (line[sort_col] == '' or line[sort_col] == 'NA')]
        unsorted_tab = [tab[i] for i in unsortable_lines]
        tab = [line for i, line in enumerate(tab) if i not in unsortable_lines]

        if only_number(tab, sort_col) and any_float(tab, sort_col):
            tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse)
        elif only_number(tab, sort_col):
            tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse)
        else:
            tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse)

        tab.extend(unsorted_tab)
        if header is True:
            tab = [head] + tab

    return tab


def only_number(tab, col):
    """Return True if every cell of column `col` parses as an int or float.

    Decimal commas are normalized to dots before testing.
    """
    for line in tab:
        if not (is_number("float", line[col].replace(",", ".")) or is_number("int", line[col].replace(",", "."))):
            return False
    return True


def is_number(number_format, n):
    """Check whether string `n` matches the given numeric format.

    Args:
        number_format: "int" or "float".
        n: the string to test.

    Returns:
        True on match, False otherwise (the original implicitly returned
        None; also the decimal point was an unescaped `.` in the regex, so
        strings like "1a2" were wrongly accepted as floats and later
        crashed float() in sort_by_column — both fixed here).
    """
    float_format = re.compile(r"^[-]?[0-9][0-9]*\.?[0-9]+$")
    int_format = re.compile(r"^[-]?[0-9][0-9]*$")
    if number_format == "int":
        return bool(re.match(int_format, n))
    elif number_format == "float":
        return bool(re.match(float_format, n))
    return False


def any_float(tab, col):
    """Return True if at least one cell of column `col` looks like a float.

    Decimal commas are normalized to dots before testing.
    """
    for line in tab:
        if is_number("float", line[col].replace(",", ".")):
            return True

    return False


def main():
    # Get args from command line
    global args
    args = get_args()

    # get PPI dictionary (shared by all *_output_files helpers)
    with open(args.dict_path, 'r') as handle:
        global ppi_dict
        ppi_dict = json.load(handle)

    # Get file and/or ids from input
    if args.input_type == "text":
        ids = get_input_ids_from_string(args.input)
    elif args.input_type == "file":
        input_file, ids = get_input_ids_from_file(args.input, args.ncol, args.header)
    else:
        # fix: the original fell through and crashed later with NameError
        sys.exit("Invalid --input_type: expected 'text' or 'file'")

    # create output files
    if args.database == "biogrid":
        network_file, nodes_file = biogrid_output_files(ids, args.species)
    elif args.database == "bioplex":
        network_file, nodes_file = bioplex_output_files(ids, args.id_type, args.species)
    elif args.database == "humap":
        network_file, nodes_file = humap_output_files(ids, args.species)
    else:
        # fix: same NameError fall-through for an unknown database
        sys.exit("Invalid --database: expected 'biogrid', 'bioplex' or 'humap'")

    # convert blank to NA and sort files
    network_file = blank_to_NA(network_file)
    network_file = sort_by_column(network_file, 0, False, True)
    nodes_file = sort_by_column(nodes_file, 0, False, True)

    # write output files
    with open(args.network_output, "w") as output:
        writer = csv.writer(output, delimiter="\t")
        writer.writerows(network_file)

    with open(args.nodes_output, "w") as output:
        writer = csv.writer(output, delimiter="\t")
        if text_type is str:
            # Python 3: csv handles text natively; just stringify cells
            # (nodes rows contain booleans)
            for row in nodes_file:
                writer.writerow([text_type(s) for s in row])
        else:
            # Python 2: encode to UTF-8 bytes as the original did
            for row in nodes_file:
                writer.writerow([text_type(s).encode("utf-8") for s in row])


if __name__ == "__main__":
    main()