Mercurial > repos > proteore > proteore_id_converter
comparison id_converter.py @ 19:9d758344d36e draft
planemo upload commit 339ab77a83db03409c8001324f10b36ff5b13a39-dirty
author | proteore |
---|---|
date | Wed, 19 Jun 2019 04:38:12 -0400 |
parents | 5252bbcfbdd7 |
children | 6e65e1c78705 |
comparison
equal
deleted
inserted
replaced
18:5252bbcfbdd7 | 19:9d758344d36e |
---|---|
56 | 56 |
57 ids_list= list(set(ids_list)) | 57 ids_list= list(set(ids_list)) |
58 | 58 |
59 return new_file, ids_list | 59 return new_file, ids_list |
60 | 60 |
61 #not used | |
61 def output_one_id_one_line(line,convert_ids,target_ids): | 62 def output_one_id_one_line(line,convert_ids,target_ids): |
62 | 63 |
63 ids_not_processed = ["GI","PDB","GO","PIR","MIM","UniGene","BioGrid","STRING"] #ids with multiple ids per line in output file | 64 #ids_not_processed = ["GI","PDB","GO","PIR","MIM","UniGene","BioGrid","STRING"] #ids with multiple ids per line in output file |
65 ids_not_processed = ["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"] # All Ids | |
64 ids_not_processed = [id for id in ids_not_processed if id in target_ids] #ids present in target_ids with multiple ids per line in output file | 66 ids_not_processed = [id for id in ids_not_processed if id in target_ids] #ids present in target_ids with multiple ids per line in output file |
65 | 67 |
66 for id_not_processed in ids_not_processed : | 68 for id_not_processed in ids_not_processed : |
67 index = target_ids.index(id_not_processed) | 69 index = target_ids.index(id_not_processed) |
68 convert_ids[index] = [";".join(convert_ids[index])] | 70 convert_ids[index] = [";".join(convert_ids[index])] |
123 for i,id in enumerate(ids_list) : | 125 for i,id in enumerate(ids_list) : |
124 ids_dictionary_index[i]=id | 126 ids_dictionary_index[i]=id |
125 | 127 |
126 return(ids_dictionary,ids_dictionary_index) | 128 return(ids_dictionary,ids_dictionary_index) |
127 | 129 |
def create_header(input_file,ncol,id_type,target_ids):
    """Build a header row for an input file that has no header of its own.

    Every input column gets a placeholder name ("col1", "col2", ...), the
    column holding the IDs is renamed to the input ID type, and one column
    per target ID type is appended.

    Args:
        input_file: rows of the input file, each row a list of cells. Only
            the length of the first row is used.
        ncol: 0-based index of the column that contains the input IDs.
        id_type: name of the input ID type, used as that column's name.
        target_ids: list of target ID type names appended after the input
            columns.

    Returns:
        A new list of column names; target_ids is not modified.

    Raises:
        IndexError: if ncol does not address one of the generated columns.
    """
    # The first row gives the column count. With no rows at all there is
    # nothing to measure, so make the header just wide enough to hold the
    # ID column instead of failing on input_file[0].
    nb_cols = len(input_file[0]) if input_file else ncol + 1

    col_names = ["col" + str(position) for position in range(1, nb_cols + 1)]
    col_names[ncol] = id_type
    return col_names + target_ids
136 | |
128 def main(): | 137 def main(): |
129 | 138 |
130 #Get args from command line | 139 #Get args from command line |
131 args = get_args() | 140 args = get_args() |
132 target_ids = args.target_ids.split(",") | 141 target_ids = args.target_ids.split(",") |
156 for id in ref_ids.replace(" ","").split(";") : #if there's more than one id, one key per id (example : GO) | 165 for id in ref_ids.replace(" ","").split(";") : #if there's more than one id, one key per id (example : GO) |
157 if id not in ids_dictionary : #if the key is not created yet | 166 if id not in ids_dictionary : #if the key is not created yet |
158 ids_dictionary[id]={} | 167 ids_dictionary[id]={} |
159 for other_id_type in other_id_type_index : | 168 for other_id_type in other_id_type_index : |
160 if ids_dictionary_index[other_id_type] not in ids_dictionary[id] : | 169 if ids_dictionary_index[other_id_type] not in ids_dictionary[id] : |
161 ids_dictionary[id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace(" ","").split(";")) | 170 ids_dictionary[id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace("NA","").replace(" ","").split(";")) |
162 else : | 171 else : |
163 ids_dictionary[id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";")) | 172 ids_dictionary[id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace("NA","").replace(" ","").split(";")) |
164 if len(ids_dictionary[id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[id][ids_dictionary_index[other_id_type]] : | 173 if len(ids_dictionary[id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[id][ids_dictionary_index[other_id_type]] : |
165 ids_dictionary[id][ids_dictionary_index[other_id_type]].remove('') | 174 ids_dictionary[id][ids_dictionary_index[other_id_type]].remove('') |
166 | 175 |
167 print ("dictionary created") | 176 #print ("dictionary created") |
168 | 177 |
169 #Get file and/or ids from input | 178 #Get file and/or ids from input |
170 if args.input_type == "list" : | 179 if args.input_type == "list" : |
171 ids = get_input_ids_from_string(args.input) | 180 ids = get_input_ids_from_string(args.input) |
172 elif args.input_type == "file" : | 181 elif args.input_type == "file" : |
173 input_file, ids = get_input_ids_from_file(args.input,args.column_number,header) | 182 input_file, ids = get_input_ids_from_file(args.input,args.column_number,header) |
174 | 183 |
175 print ("starting mapping") | 184 #print ("starting mapping") |
176 | 185 |
177 #Mapping ids | 186 #Mapping ids |
178 result_dict = map_to_dictionary(ids,ids_dictionary,args.id_type,target_ids) | 187 result_dict = map_to_dictionary(ids,ids_dictionary,args.id_type,target_ids) |
179 | 188 |
180 print ("mapping done") | 189 #print ("mapping done") |
181 | 190 |
182 print ("creating output file") | 191 #print ("creating output file") |
183 #creating output file | 192 #creating output file |
184 with open(args.output,"w") as output : | 193 with open(args.output,"w") as output : |
185 writer = csv.writer(output,delimiter="\t") | 194 writer = csv.writer(output,delimiter="\t") |
186 #writer.writerows(output_file) | 195 #writer.writerows(output_file) |
187 | 196 |
188 #write header | 197 #write header |
189 if header : | 198 if header : |
190 writer.writerow(input_file[0]+target_ids) | 199 writer.writerow(input_file[0]+target_ids) |
191 input_file = input_file[1:] | 200 input_file = input_file[1:] |
192 else : | 201 elif args.input_type=="file": |
202 col_names = create_header(input_file,args.column_number,args.id_type,target_ids) | |
203 writer.writerow(col_names) | |
204 else : | |
193 writer.writerow([args.id_type]+target_ids) | 205 writer.writerow([args.id_type]+target_ids) |
194 | 206 |
195 #write lines | 207 #write lines |
208 previous_line="" | |
196 if args.input_type=="file" : | 209 if args.input_type=="file" : |
197 for line in input_file : | 210 for line in input_file : |
198 tmp = output_one_id_one_line(line,result_dict[line[args.column_number]],target_ids) | 211 res = [";".join(list(res_ids)) for res_ids in result_dict[line[args.column_number]]] |
199 tmp = blank_to_NA(tmp) | 212 line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line+res] |
200 for row in tmp : | 213 if previous_line != line : |
201 writer.writerow(row) | 214 writer.writerow(line) |
215 previous_line=line | |
202 elif args.input_type=="list" : | 216 elif args.input_type=="list" : |
203 for id in ids : | 217 for id in ids : |
204 tmp = output_one_id_one_line([id],result_dict[id],target_ids) | 218 res = [";".join(list(res_ids)) for res_ids in result_dict[id]] |
205 tmp = blank_to_NA(tmp) | 219 line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in [id]+res] |
206 for row in tmp : | 220 if previous_line != line : |
207 writer.writerow(row) | 221 writer.writerow(line) |
208 | 222 previous_line=line |
209 print ("output file created") | 223 |
224 #print ("output file created") | |
210 | 225 |
211 if __name__ == "__main__": | 226 if __name__ == "__main__": |
212 main() | 227 main() |
213 | 228 |