Mercurial > repos > proteore > proteore_id_converter
diff id_converter.py @ 18:5252bbcfbdd7 draft
planemo upload commit dc6d9d8f1808c4c6bcc42ac6861a8b811e4cee58-dirty
author | proteore |
---|---|
date | Fri, 10 May 2019 10:38:46 -0400 |
parents | 1e45ea50f145 |
children | 9d758344d36e |
line wrap: on
line diff
--- a/id_converter.py	Thu Mar 07 07:49:18 2019 -0500
+++ b/id_converter.py	Fri May 10 10:38:46 2019 -0400
@@ -1,4 +1,4 @@
-import sys, os, argparse, re, csv
+import sys, os, argparse, re, csv, itertools
 
 def get_args() :
     parser = argparse.ArgumentParser()
@@ -58,6 +58,21 @@
 
     return new_file, ids_list
 
+def output_one_id_one_line(line,convert_ids,target_ids):
+
+    ids_not_processed = ["GI","PDB","GO","PIR","MIM","UniGene","BioGrid","STRING"] #ids with multiple ids per line in output file
+    ids_not_processed = [id for id in ids_not_processed if id in target_ids] #ids present in target_ids with multiple ids per line in output file
+
+    for id_not_processed in ids_not_processed :
+        index = target_ids.index(id_not_processed)
+        convert_ids[index] = [";".join(convert_ids[index])]
+
+    res = itertools.product(*convert_ids) #getting all possibilities between lists of ids
+    res = [list(e) for e in res] #convert to lists
+    res = [line+list(ids) for ids in res] #adding the rest of the line
+
+    return(res)
+
 #return the column number in int format
 def nb_col_to_int(nb_col):
     try :
@@ -90,7 +105,7 @@
     for id in ids :
         for target_id in id_out :
             if id in ids_dictionary :
-                res = ";".join(ids_dictionary[id][target_id])
+                res = ids_dictionary[id][target_id]
             else :
                 res=""
 
@@ -149,36 +164,49 @@
                 if len(ids_dictionary[id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[id][ids_dictionary_index[other_id_type]] :
                     ids_dictionary[id][ids_dictionary_index[other_id_type]].remove('')
 
+    print ("dictionary created")
+
     #Get file and/or ids from input
     if args.input_type == "list" :
         ids = get_input_ids_from_string(args.input)
     elif args.input_type == "file" :
         input_file, ids = get_input_ids_from_file(args.input,args.column_number,header)
 
+    print ("starting mapping")
+
     #Mapping ids
     result_dict = map_to_dictionary(ids,ids_dictionary,args.id_type,target_ids)
 
-    #creating output file
-    if header :
-        output_file=[input_file[0]+target_ids]
-        input_file = input_file[1:]
-    else :
-        output_file=[[args.id_type]+target_ids]
+    print ("mapping done")
 
-    if args.input_type=="file" :
-        for line in input_file :
-            output_file.append(line+result_dict[line[args.column_number]])
-    elif args.input_type=="list" :
-        for id in ids :
-            output_file.append([id]+result_dict[id])
-
-    #convert blank to NA
-    output_file = blank_to_NA(output_file)
-
-    #write output file
+    print ("creating output file")
+    #creating output file
     with open(args.output,"w") as output :
         writer = csv.writer(output,delimiter="\t")
-        writer.writerows(output_file)
+        #writer.writerows(output_file)
+
+        #write header
+        if header :
+            writer.writerow(input_file[0]+target_ids)
+            input_file = input_file[1:]
+        else :
+            writer.writerow([args.id_type]+target_ids)
+
+        #write lines
+        if args.input_type=="file" :
+            for line in input_file :
+                tmp = output_one_id_one_line(line,result_dict[line[args.column_number]],target_ids)
+                tmp = blank_to_NA(tmp)
+                for row in tmp :
+                    writer.writerow(row)
+        elif args.input_type=="list" :
+            for id in ids :
+                tmp = output_one_id_one_line([id],result_dict[id],target_ids)
+                tmp = blank_to_NA(tmp)
+                for row in tmp :
+                    writer.writerow(row)
+
+    print ("output file created")
 
 if __name__ == "__main__":
     main()