Mercurial > repos > proteore > proteore_id_converter
diff id_converter.py @ 19:9d758344d36e draft
planemo upload commit 339ab77a83db03409c8001324f10b36ff5b13a39-dirty
author | proteore |
---|---|
date | Wed, 19 Jun 2019 04:38:12 -0400 |
parents | 5252bbcfbdd7 |
children | 6e65e1c78705 |
line wrap: on
line diff
--- a/id_converter.py Fri May 10 10:38:46 2019 -0400 +++ b/id_converter.py Wed Jun 19 04:38:12 2019 -0400 @@ -58,9 +58,11 @@ return new_file, ids_list +#not used def output_one_id_one_line(line,convert_ids,target_ids): - ids_not_processed = ["GI","PDB","GO","PIR","MIM","UniGene","BioGrid","STRING"] #ids with multiple ids per line in output file + #ids_not_processed = ["GI","PDB","GO","PIR","MIM","UniGene","BioGrid","STRING"] #ids with multiple ids per line in output file + ids_not_processed = ["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"] # All Ids ids_not_processed = [id for id in ids_not_processed if id in target_ids] #ids present in target_ids with multiple ids per line in output file for id_not_processed in ids_not_processed : @@ -125,6 +127,13 @@ return(ids_dictionary,ids_dictionary_index) +def create_header(input_file,ncol,id_type,target_ids): + col_names = list(range(1,len(input_file[0])+1)) + col_names = ["col"+str(e) for e in col_names] + col_names[ncol]=id_type + col_names = col_names+target_ids + return(col_names) + def main(): #Get args from command line @@ -158,13 +167,13 @@ ids_dictionary[id]={} for other_id_type in other_id_type_index : if ids_dictionary_index[other_id_type] not in ids_dictionary[id] : - ids_dictionary[id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace(" ","").split(";")) + ids_dictionary[id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace("NA","").replace(" ","").split(";")) else : - ids_dictionary[id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";")) + ids_dictionary[id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace("NA","").replace(" ","").split(";")) if len(ids_dictionary[id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[id][ids_dictionary_index[other_id_type]] : ids_dictionary[id][ids_dictionary_index[other_id_type]].remove('') - print ("dictionary created") + #print ("dictionary created") #Get file and/or ids from input if args.input_type == "list" : @@ -172,14 +181,14 @@ elif args.input_type == "file" : input_file, ids = get_input_ids_from_file(args.input,args.column_number,header) - print ("starting mapping") + #print ("starting mapping") #Mapping ids result_dict = map_to_dictionary(ids,ids_dictionary,args.id_type,target_ids) - print ("mapping done") + #print ("mapping done") - print ("creating output file") + #print ("creating output file") #creating output file with open(args.output,"w") as output : writer = csv.writer(output,delimiter="\t") @@ -188,25 +197,31 @@ #write header if header : writer.writerow(input_file[0]+target_ids) - input_file = input_file[1:] - else : + input_file = input_file[1:] + elif args.input_type=="file": + col_names = create_header(input_file,args.column_number,args.id_type,target_ids) + writer.writerow(col_names) + else : writer.writerow([args.id_type]+target_ids) #write lines + previous_line="" if args.input_type=="file" : for line in input_file : - tmp = output_one_id_one_line(line,result_dict[line[args.column_number]],target_ids) - tmp = blank_to_NA(tmp) - for row in tmp : - writer.writerow(row) + res = [";".join(list(res_ids)) for res_ids in result_dict[line[args.column_number]]] + line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line+res] + if previous_line != line : + writer.writerow(line) + previous_line=line elif args.input_type=="list" : for id in ids : - tmp = output_one_id_one_line([id],result_dict[id],target_ids) - tmp = blank_to_NA(tmp) - for row in tmp : - writer.writerow(row) + res = [";".join(list(res_ids)) for res_ids in result_dict[id]] + line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in [id]+res] + if previous_line != line : + writer.writerow(line) + previous_line=line - print ("output file created") + #print ("output file created") if __name__ == "__main__": main()