comparison id_converter.py @ 18:5252bbcfbdd7 draft

planemo upload commit dc6d9d8f1808c4c6bcc42ac6861a8b811e4cee58-dirty
author proteore
date Fri, 10 May 2019 10:38:46 -0400
parents 1e45ea50f145
children 9d758344d36e
comparison
equal deleted inserted replaced
17:1e45ea50f145 18:5252bbcfbdd7
1 import sys, os, argparse, re, csv 1 import sys, os, argparse, re, csv, itertools
2 2
3 def get_args() : 3 def get_args() :
4 parser = argparse.ArgumentParser() 4 parser = argparse.ArgumentParser()
5 parser.add_argument("-d", "--ref_file", help="path to reference file: <species>_id_mapping.tsv", required=True) 5 parser.add_argument("-d", "--ref_file", help="path to reference file: <species>_id_mapping.tsv", required=True)
6 parser.add_argument("--input_type", help="type of input (list of id or filename)", required=True) 6 parser.add_argument("--input_type", help="type of input (list of id or filename)", required=True)
56 56
57 ids_list= list(set(ids_list)) 57 ids_list= list(set(ids_list))
58 58
59 return new_file, ids_list 59 return new_file, ids_list
60 60
def output_one_id_one_line(line, convert_ids, target_ids):
    """Expand one input line into one output row per combination of mapped ids.

    line        -- list of cells of the input line (or ``[id]`` for list input);
                   it is copied in front of every generated row.
    convert_ids -- one entry per target id type, in the same order as
                   ``target_ids``. Each entry is a collection of mapped ids
                   (set/list) or the empty string ``""`` that
                   ``map_to_dictionary`` stores when the id is not in the
                   reference dictionary.
    target_ids  -- names of the requested target id types.

    Returns a list of rows (lists). Target types that may hold many ids
    (GI, PDB, GO, ...) are kept in a single cell joined with ";"; every other
    target type is expanded so that each row holds exactly one id per column.

    The caller's ``convert_ids`` is left untouched.
    """
    # ids with multiple ids per line in output file
    ids_not_processed = ["GI", "PDB", "GO", "PIR", "MIM", "UniGene", "BioGrid", "STRING"]

    columns = []
    for index, values in enumerate(convert_ids):
        if isinstance(values, str):
            # An unmapped id arrives as "" (a bare string). Iterating a string
            # would yield its characters - and nothing at all for "" - which
            # made itertools.product return no rows and dropped the line.
            values = [values]
        else:
            # Work on a copy so the shared result dictionary is not modified.
            values = list(values)

        if not values:
            # Keep the line in the output with a blank cell rather than lose it.
            values = [""]

        if index < len(target_ids) and target_ids[index] in ids_not_processed:
            values = [";".join(values)]

        columns.append(values)

    # All possibilities between the lists of ids, each prefixed with the line.
    return [line + list(ids) for ids in itertools.product(*columns)]
75
61 #return the column number in int format 76 #return the column number in int format
62 def nb_col_to_int(nb_col): 77 def nb_col_to_int(nb_col):
63 try : 78 try :
64 nb_col = int(nb_col.replace("c", "")) - 1 79 nb_col = int(nb_col.replace("c", "")) - 1
65 return nb_col 80 return nb_col
88 103
89 result_dict = {} 104 result_dict = {}
90 for id in ids : 105 for id in ids :
91 for target_id in id_out : 106 for target_id in id_out :
92 if id in ids_dictionary : 107 if id in ids_dictionary :
93 res = ";".join(ids_dictionary[id][target_id]) 108 res = ids_dictionary[id][target_id]
94 else : 109 else :
95 res="" 110 res=""
96 111
97 if id in result_dict : 112 if id in result_dict :
98 result_dict[id].append(res) 113 result_dict[id].append(res)
147 else : 162 else :
148 ids_dictionary[id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";")) 163 ids_dictionary[id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";"))
149 if len(ids_dictionary[id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[id][ids_dictionary_index[other_id_type]] : 164 if len(ids_dictionary[id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[id][ids_dictionary_index[other_id_type]] :
150 ids_dictionary[id][ids_dictionary_index[other_id_type]].remove('') 165 ids_dictionary[id][ids_dictionary_index[other_id_type]].remove('')
151 166
167 print ("dictionary created")
168
152 #Get file and/or ids from input 169 #Get file and/or ids from input
153 if args.input_type == "list" : 170 if args.input_type == "list" :
154 ids = get_input_ids_from_string(args.input) 171 ids = get_input_ids_from_string(args.input)
155 elif args.input_type == "file" : 172 elif args.input_type == "file" :
156 input_file, ids = get_input_ids_from_file(args.input,args.column_number,header) 173 input_file, ids = get_input_ids_from_file(args.input,args.column_number,header)
157 174
175 print ("starting mapping")
176
158 #Mapping ids 177 #Mapping ids
159 result_dict = map_to_dictionary(ids,ids_dictionary,args.id_type,target_ids) 178 result_dict = map_to_dictionary(ids,ids_dictionary,args.id_type,target_ids)
160 179
180 print ("mapping done")
181
182 print ("creating output file")
161 #creating output file 183 #creating output file
162 if header :
163 output_file=[input_file[0]+target_ids]
164 input_file = input_file[1:]
165 else :
166 output_file=[[args.id_type]+target_ids]
167
168 if args.input_type=="file" :
169 for line in input_file :
170 output_file.append(line+result_dict[line[args.column_number]])
171 elif args.input_type=="list" :
172 for id in ids :
173 output_file.append([id]+result_dict[id])
174
175 #convert blank to NA
176 output_file = blank_to_NA(output_file)
177
178 #write output file
179 with open(args.output,"w") as output : 184 with open(args.output,"w") as output :
180 writer = csv.writer(output,delimiter="\t") 185 writer = csv.writer(output,delimiter="\t")
181 writer.writerows(output_file) 186 #writer.writerows(output_file)
187
188 #write header
189 if header :
190 writer.writerow(input_file[0]+target_ids)
191 input_file = input_file[1:]
192 else :
193 writer.writerow([args.id_type]+target_ids)
194
195 #write lines
196 if args.input_type=="file" :
197 for line in input_file :
198 tmp = output_one_id_one_line(line,result_dict[line[args.column_number]],target_ids)
199 tmp = blank_to_NA(tmp)
200 for row in tmp :
201 writer.writerow(row)
202 elif args.input_type=="list" :
203 for id in ids :
204 tmp = output_one_id_one_line([id],result_dict[id],target_ids)
205 tmp = blank_to_NA(tmp)
206 for row in tmp :
207 writer.writerow(row)
208
209 print ("output file created")
182 210
183 if __name__ == "__main__": 211 if __name__ == "__main__":
184 main() 212 main()
185 213