Mercurial > repos > proteore > proteore_build_protein_interaction_maps
comparison build_protein_interaction_maps.py @ 0:b0ac71686b99 draft
planemo upload commit 968cd5b4f78f0a1da86fc3bc29f8159f86e199aa-dirty
author | proteore |
---|---|
date | Tue, 12 Mar 2019 05:55:54 -0400 |
parents | |
children | 0a85d709c4ae |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b0ac71686b99 |
---|---|
1 # -*- coding: utf-8 -*- | |
# -*- coding: utf-8 -*-
import argparse
import csv
import json
import re
import sys
3 | |
4 def get_args() : | |
5 parser = argparse.ArgumentParser() | |
6 parser.add_argument("--species") | |
7 parser.add_argument("--database", help="Humap, Bioplex or Biogrid", required=True) | |
8 parser.add_argument("--dict_path", required=True) | |
9 parser.add_argument("--input_type", help="type of input (list of id or filename)",required=True) | |
10 parser.add_argument("--input", required=True) | |
11 parser.add_argument("--header") | |
12 parser.add_argument("--ncol") | |
13 parser.add_argument("--id_type") | |
14 parser.add_argument("--network_output") | |
15 parser.add_argument("--nodes_output") | |
16 args = parser.parse_args() | |
17 | |
18 if args.input_type=="file" : | |
19 args.ncol = nb_col_to_int(args.ncol) | |
20 args.header = str2bool(args.header) | |
21 | |
22 return args | |
23 | |
24 #Turn string into boolean | |
25 def str2bool(v): | |
26 if v.lower() in ('yes', 'true', 't', 'y', '1'): | |
27 return True | |
28 elif v.lower() in ('no', 'false', 'f', 'n', '0'): | |
29 return False | |
30 else: | |
31 raise argparse.ArgumentTypeError('Boolean value expected.') | |
32 | |
33 #return the column number in int format | |
34 def nb_col_to_int(nb_col): | |
35 try : | |
36 nb_col = int(nb_col.replace("c", "")) - 1 | |
37 return nb_col | |
38 except : | |
39 sys.exit("Please specify the column where you would like to apply the filter with valid format") | |
40 | |
41 #return list of (unique) ids from string | |
42 def get_input_ids_from_string(input) : | |
43 ids_list = list(set(re.split(r'\s+',input.replace(";"," ").replace("\r","").replace("\n"," ").replace("\t"," ")))) | |
44 if "" in ids_list : ids_list.remove("") | |
45 #if "NA" in ids_list : ids_list.remove("NA") | |
46 return ids_list | |
47 | |
48 #return input_file and list of unique ids from input file path | |
49 def get_input_ids_from_file(input,nb_col,header) : | |
50 with open(input, "r") as csv_file : | |
51 input_file= list(csv.reader(csv_file, delimiter='\t')) | |
52 | |
53 input_file, ids_list = one_id_one_line(input_file,nb_col,header) | |
54 if "" in ids_list : ids_list.remove("") | |
55 #if "NA" in ids_list : ids_list.remove("NA") | |
56 | |
57 return input_file, ids_list | |
58 | |
59 #return input file by adding lines when there are more than one id per line | |
60 def one_id_one_line(input_file,nb_col,header) : | |
61 | |
62 if header : | |
63 new_file = [input_file[0]] | |
64 input_file = input_file[1:] | |
65 else : | |
66 new_file=[] | |
67 ids_list=[] | |
68 | |
69 for line in input_file : | |
70 if line != [] and set(line) != {''}: | |
71 line[nb_col] = re.sub(r"\s+","",line[nb_col]) | |
72 if ";" in line[nb_col] : | |
73 ids = line[nb_col].split(";") | |
74 for id in ids : | |
75 new_file.append(line[:nb_col]+[id]+line[nb_col+1:]) | |
76 ids_list.append(id) | |
77 else : | |
78 new_file.append(line) | |
79 ids_list.append(line[nb_col]) | |
80 | |
81 ids_list= list(set(ids_list)) | |
82 | |
83 return new_file, ids_list | |
84 | |
85 #replace all blank cells to NA | |
86 def blank_to_NA(csv_file) : | |
87 tmp=[] | |
88 for line in csv_file : | |
89 line = ["NA" if cell=="" or cell==" " or cell=="NaN" or cell=="-" else cell for cell in line] | |
90 tmp.append(line) | |
91 | |
92 return tmp | |
93 | |
94 def biogrid_output_files(ids,species) : | |
95 network_file=[["Entrez Gene Interactor A","Entrez Gene Interactor B","Gene symbol Interactor A","Gene symbol Interactor B","Experimental System","Experimental Type","Pubmed ID","Interaction Score","Phenotypes"]] | |
96 ids_set= set(ids) | |
97 ids_not_found=set([]) | |
98 for id in ids : | |
99 if id in ppi_dict['network'] : | |
100 network_file.extend(ppi_dict['network'][id]) | |
101 ids_set.update([interact[1] for interact in ppi_dict['network'][id]]) | |
102 else : | |
103 ids_not_found.add(id) | |
104 | |
105 nodes_file = [["Entrez gene ID","Official Symbol Interactor","Present in user input ids","ID present in Biogrid "+species,"Pathway"]] | |
106 for id in ids_set: | |
107 #get pathway | |
108 if id in ppi_dict['nodes']: | |
109 description_pathway=";".join(ppi_dict['nodes'][id]) | |
110 else : | |
111 description_pathway="NA" | |
112 | |
113 #get gene name | |
114 if id in ppi_dict['network']: gene_name = ppi_dict['network'][id][0][2] | |
115 else : gene_name="NA" | |
116 | |
117 #make line | |
118 nodes_file.append([id]+[gene_name]+[id in ids]+[id not in ids_not_found]+[description_pathway]) | |
119 | |
120 return network_file,nodes_file | |
121 | |
122 def bioplex_output_files(ids,id_type,species) : | |
123 network_file=[[id_type+" Interactor A",id_type+" Interactor B","Gene symbol Interactor A","Gene symbol Interactor B","Interaction Score"]] | |
124 ids_set= set(ids) | |
125 ids_not_found=set([]) | |
126 for id in ids : | |
127 if id in ppi_dict['network'][id_type] : | |
128 network_file.extend(ppi_dict['network'][id_type][id]) | |
129 ids_set.update([interact[1] for interact in ppi_dict['network'][id_type][id]]) | |
130 else : | |
131 ids_not_found.add(id) | |
132 | |
133 if id_type=="UniProt-AC" : nodes_file=[[id_type,"Present in user input ids","ID present in Human Bioplex","Pathway"]] | |
134 else: nodes_file=[[id_type,"Official symbol Interactor","Present in user input ids","Present in interactome","Pathway"]] | |
135 for id in ids_set: | |
136 | |
137 if id in ppi_dict['nodes'][id_type]: | |
138 description_pathway=";".join(ppi_dict['nodes'][id_type][id]) | |
139 else : | |
140 description_pathway="NA" | |
141 | |
142 #make line | |
143 if id_type=="UniProt-AC": | |
144 nodes_file.append([id]+[id in ids]+[id not in ids_not_found]+[description_pathway]) | |
145 elif id_type=="GeneID": | |
146 #get gene_name | |
147 if id in ppi_dict['network'][id_type]: gene_name = ppi_dict['network'][id_type][id][0][2] | |
148 else : gene_name="NA" | |
149 nodes_file.append([id]+[gene_name]+[id in ids]+[id not in ids_not_found]+[description_pathway]) | |
150 | |
151 return network_file,nodes_file | |
152 | |
153 def humap_output_files(ids,species) : | |
154 network_file=[["Entrez Gene Interactor A","Entrez Gene Interactor B","Gene symbol Interactor A","Gene symbol Interactor B","Interaction Score"]] | |
155 ids_set= set(ids) | |
156 ids_not_found=set([]) | |
157 for id in ids : | |
158 if id in ppi_dict['network'] : | |
159 network_file.extend(ppi_dict['network'][id]) | |
160 ids_set.update([interact[1] for interact in ppi_dict['network'][id]]) | |
161 else : | |
162 ids_not_found.add(id) | |
163 | |
164 nodes_file = [["Entrez gene ID","Official Symbol Interactor","Present in user input ids","ID present in Hu.MAP","Pathway"]] | |
165 for id in ids_set: | |
166 if id in ppi_dict['nodes']: | |
167 description_pathway=";".join(ppi_dict['nodes'][id]) | |
168 else : | |
169 description_pathway="NA" | |
170 | |
171 #get gene name | |
172 if id in ppi_dict['gene_name']: | |
173 gene_name = ppi_dict['gene_name'][id] | |
174 else : | |
175 gene_name = "NA" | |
176 | |
177 #make line | |
178 nodes_file.append([id]+[gene_name]+[id in ids]+[id not in ids_not_found]+[description_pathway]) | |
179 | |
180 return network_file,nodes_file | |
181 | |
182 #function to sort the csv_file by value in a specific column | |
183 def sort_by_column(tab,sort_col,reverse,header): | |
184 | |
185 if len(tab) > 1 : #if there's more than just a header or 1 row | |
186 if header : | |
187 head=tab[0] | |
188 tab=tab[1:] | |
189 | |
190 #list of empty cells in the column to sort | |
191 unsortable_lines = [i for i,line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')] | |
192 unsorted_tab=[ tab[i] for i in unsortable_lines] | |
193 tab= [line for i,line in enumerate(tab) if i not in unsortable_lines] | |
194 | |
195 if only_number(tab,sort_col) and any_float(tab,sort_col) : | |
196 tab = sorted(tab, key=lambda row: float(row[sort_col]), reverse=reverse) | |
197 elif only_number(tab,sort_col): | |
198 tab = sorted(tab, key=lambda row: int(row[sort_col]), reverse=reverse) | |
199 else : | |
200 tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse) | |
201 | |
202 tab.extend(unsorted_tab) | |
203 if header is True : tab = [head]+tab | |
204 | |
205 return tab | |
206 | |
207 def only_number(tab,col) : | |
208 | |
209 for line in tab : | |
210 if not (is_number("float",line[col].replace(",",".")) or is_number("int",line[col].replace(",","."))) : | |
211 return False | |
212 return True | |
213 | |
214 #Check if a variable is a float or an integer | |
215 def is_number(number_format, n): | |
216 float_format = re.compile(r"^[-]?[0-9][0-9]*.?[0-9]+$") | |
217 int_format = re.compile(r"^[-]?[0-9][0-9]*$") | |
218 test = "" | |
219 if number_format == "int": | |
220 test = re.match(int_format, n) | |
221 elif number_format == "float": | |
222 test = re.match(float_format, n) | |
223 if test: | |
224 return True | |
225 | |
226 #return True is there is at least one float in the column | |
227 def any_float(tab,col) : | |
228 | |
229 for line in tab : | |
230 if is_number("float",line[col].replace(",",".")) : | |
231 return True | |
232 | |
233 return False | |
234 | |
235 def main() : | |
236 | |
237 #Get args from command line | |
238 global args | |
239 args = get_args() | |
240 | |
241 #get PPI dictionary | |
242 with open(args.dict_path, 'r') as handle: | |
243 global ppi_dict | |
244 ppi_dict = json.load(handle) | |
245 | |
246 #Get file and/or ids from input | |
247 if args.input_type == "text" : | |
248 ids = get_input_ids_from_string(args.input) | |
249 elif args.input_type == "file" : | |
250 input_file, ids = get_input_ids_from_file(args.input,args.ncol,args.header) | |
251 | |
252 #create output files | |
253 if args.database=="biogrid": | |
254 network_file, nodes_file = biogrid_output_files(ids,args.species) | |
255 elif args.database=="bioplex": | |
256 network_file, nodes_file = bioplex_output_files(ids,args.id_type,args.species) | |
257 elif args.database=="humap": | |
258 network_file, nodes_file = humap_output_files(ids,args.species) | |
259 | |
260 #convert blank to NA and sort files | |
261 network_file = blank_to_NA(network_file) | |
262 network_file = sort_by_column(network_file,0,False,True) | |
263 nodes_file = sort_by_column(nodes_file,0,False,True) | |
264 | |
265 #write output files | |
266 with open(args.network_output,"w") as output : | |
267 writer = csv.writer(output,delimiter="\t") | |
268 writer.writerows(network_file) | |
269 | |
270 with open(args.nodes_output,"w") as output : | |
271 writer = csv.writer(output,delimiter="\t") | |
272 for row in nodes_file: | |
273 writer.writerow([unicode(s).encode("utf-8") for s in row]) | |
274 | |
275 if __name__ == "__main__": | |
276 main() |