Mercurial > repos > proteore > proteore_id_converter
comparison id_converter.py @ 18:5252bbcfbdd7 draft
planemo upload commit dc6d9d8f1808c4c6bcc42ac6861a8b811e4cee58-dirty
author | proteore |
---|---|
date | Fri, 10 May 2019 10:38:46 -0400 |
parents | 1e45ea50f145 |
children | 9d758344d36e |
comparison
equal
deleted
inserted
replaced
17:1e45ea50f145 | 18:5252bbcfbdd7 |
---|---|
1 import sys, os, argparse, re, csv | 1 import sys, os, argparse, re, csv, itertools |
2 | 2 |
3 def get_args() : | 3 def get_args() : |
4 parser = argparse.ArgumentParser() | 4 parser = argparse.ArgumentParser() |
5 parser.add_argument("-d", "--ref_file", help="path to reference file: <species>_id_mapping.tsv", required=True) | 5 parser.add_argument("-d", "--ref_file", help="path to reference file: <species>_id_mapping.tsv", required=True) |
6 parser.add_argument("--input_type", help="type of input (list of id or filename)", required=True) | 6 parser.add_argument("--input_type", help="type of input (list of id or filename)", required=True) |
56 | 56 |
57 ids_list= list(set(ids_list)) | 57 ids_list= list(set(ids_list)) |
58 | 58 |
59 return new_file, ids_list | 59 return new_file, ids_list |
60 | 60 |
61 def output_one_id_one_line(line,convert_ids,target_ids): | |
62 | |
63 ids_not_processed = ["GI","PDB","GO","PIR","MIM","UniGene","BioGrid","STRING"] #ids with multiple ids per line in output file | |
64 ids_not_processed = [id for id in ids_not_processed if id in target_ids] #ids present in target_ids with multiple ids per line in output file | |
65 | |
66 for id_not_processed in ids_not_processed : | |
67 index = target_ids.index(id_not_processed) | |
68 convert_ids[index] = [";".join(convert_ids[index])] | |
69 | |
70 res = itertools.product(*convert_ids) #getting all possibilities between lists of ids | |
71 res = [list(e) for e in res] #convert to lists | |
72 res = [line+list(ids) for ids in res] #adding the rest of the line | |
73 | |
74 return(res) | |
75 | |
61 #return the column number in int format | 76 #return the column number in int format |
62 def nb_col_to_int(nb_col): | 77 def nb_col_to_int(nb_col): |
63 try : | 78 try : |
64 nb_col = int(nb_col.replace("c", "")) - 1 | 79 nb_col = int(nb_col.replace("c", "")) - 1 |
65 return nb_col | 80 return nb_col |
88 | 103 |
89 result_dict = {} | 104 result_dict = {} |
90 for id in ids : | 105 for id in ids : |
91 for target_id in id_out : | 106 for target_id in id_out : |
92 if id in ids_dictionary : | 107 if id in ids_dictionary : |
93 res = ";".join(ids_dictionary[id][target_id]) | 108 res = ids_dictionary[id][target_id] |
94 else : | 109 else : |
95 res="" | 110 res="" |
96 | 111 |
97 if id in result_dict : | 112 if id in result_dict : |
98 result_dict[id].append(res) | 113 result_dict[id].append(res) |
147 else : | 162 else : |
148 ids_dictionary[id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";")) | 163 ids_dictionary[id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";")) |
149 if len(ids_dictionary[id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[id][ids_dictionary_index[other_id_type]] : | 164 if len(ids_dictionary[id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[id][ids_dictionary_index[other_id_type]] : |
150 ids_dictionary[id][ids_dictionary_index[other_id_type]].remove('') | 165 ids_dictionary[id][ids_dictionary_index[other_id_type]].remove('') |
151 | 166 |
167 print ("dictionary created") | |
168 | |
152 #Get file and/or ids from input | 169 #Get file and/or ids from input |
153 if args.input_type == "list" : | 170 if args.input_type == "list" : |
154 ids = get_input_ids_from_string(args.input) | 171 ids = get_input_ids_from_string(args.input) |
155 elif args.input_type == "file" : | 172 elif args.input_type == "file" : |
156 input_file, ids = get_input_ids_from_file(args.input,args.column_number,header) | 173 input_file, ids = get_input_ids_from_file(args.input,args.column_number,header) |
157 | 174 |
175 print ("starting mapping") | |
176 | |
158 #Mapping ids | 177 #Mapping ids |
159 result_dict = map_to_dictionary(ids,ids_dictionary,args.id_type,target_ids) | 178 result_dict = map_to_dictionary(ids,ids_dictionary,args.id_type,target_ids) |
160 | 179 |
180 print ("mapping done") | |
181 | |
182 print ("creating output file") | |
161 #creating output file | 183 #creating output file |
162 if header : | |
163 output_file=[input_file[0]+target_ids] | |
164 input_file = input_file[1:] | |
165 else : | |
166 output_file=[[args.id_type]+target_ids] | |
167 | |
168 if args.input_type=="file" : | |
169 for line in input_file : | |
170 output_file.append(line+result_dict[line[args.column_number]]) | |
171 elif args.input_type=="list" : | |
172 for id in ids : | |
173 output_file.append([id]+result_dict[id]) | |
174 | |
175 #convert blank to NA | |
176 output_file = blank_to_NA(output_file) | |
177 | |
178 #write output file | |
179 with open(args.output,"w") as output : | 184 with open(args.output,"w") as output : |
180 writer = csv.writer(output,delimiter="\t") | 185 writer = csv.writer(output,delimiter="\t") |
181 writer.writerows(output_file) | 186 #writer.writerows(output_file) |
187 | |
188 #write header | |
189 if header : | |
190 writer.writerow(input_file[0]+target_ids) | |
191 input_file = input_file[1:] | |
192 else : | |
193 writer.writerow([args.id_type]+target_ids) | |
194 | |
195 #write lines | |
196 if args.input_type=="file" : | |
197 for line in input_file : | |
198 tmp = output_one_id_one_line(line,result_dict[line[args.column_number]],target_ids) | |
199 tmp = blank_to_NA(tmp) | |
200 for row in tmp : | |
201 writer.writerow(row) | |
202 elif args.input_type=="list" : | |
203 for id in ids : | |
204 tmp = output_one_id_one_line([id],result_dict[id],target_ids) | |
205 tmp = blank_to_NA(tmp) | |
206 for row in tmp : | |
207 writer.writerow(row) | |
208 | |
209 print ("output file created") | |
182 | 210 |
183 if __name__ == "__main__": | 211 if __name__ == "__main__": |
184 main() | 212 main() |
185 | 213 |