diff id_converter.py @ 19:9d758344d36e draft

planemo upload commit 339ab77a83db03409c8001324f10b36ff5b13a39-dirty
author proteore
date Wed, 19 Jun 2019 04:38:12 -0400
parents 5252bbcfbdd7
children 6e65e1c78705
line wrap: on
line diff
--- a/id_converter.py	Fri May 10 10:38:46 2019 -0400
+++ b/id_converter.py	Wed Jun 19 04:38:12 2019 -0400
@@ -58,9 +58,11 @@
 
     return new_file, ids_list
 
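+# Not used anymore: main() now joins the mapped ids with ";" and writes one output line per input line itself.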
 def output_one_id_one_line(line,convert_ids,target_ids):
 
-    ids_not_processed = ["GI","PDB","GO","PIR","MIM","UniGene","BioGrid","STRING"]  #ids with multiple ids per line in output file
+    #ids_not_processed = ["GI","PDB","GO","PIR","MIM","UniGene","BioGrid","STRING"]  #ids with multiple ids per line in output file
+    ids_not_processed = ["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"] # All Ids
     ids_not_processed = [id for id in ids_not_processed if id in target_ids]    #ids present in target_ids with multiple ids per line in output file
 
     for id_not_processed in ids_not_processed :
@@ -125,6 +127,13 @@
             
     return(ids_dictionary,ids_dictionary_index)
 
+def create_header(input_file,ncol,id_type,target_ids):  # builds the header row main() writes when the input is a file without a header line
+    col_names = list(range(1,len(input_file[0])+1))  # 1..N, where N is the number of cells in the first row of input_file
+    col_names = ["col"+str(e) for e in col_names]  # generic names: "col1", "col2", ...
+    col_names[ncol]=id_type  # ncol is used directly as a list index; that column is labelled with id_type instead
+    col_names = col_names+target_ids  # the target_ids entries are appended after the input columns
+    return(col_names)  # list of column names
+
 def main():
     
     #Get args from command line
@@ -158,13 +167,13 @@
                 ids_dictionary[id]={}
             for other_id_type in other_id_type_index :
                 if ids_dictionary_index[other_id_type] not in ids_dictionary[id] :
-                    ids_dictionary[id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace(" ","").split(";"))
+                    ids_dictionary[id][ids_dictionary_index[other_id_type]] = set(line[other_id_type].replace("NA","").replace(" ","").split(";"))
                 else :
-                    ids_dictionary[id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace(" ","").split(";"))
+                    ids_dictionary[id][ids_dictionary_index[other_id_type]] |= set(line[other_id_type].replace("NA","").replace(" ","").split(";"))
                 if len(ids_dictionary[id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[id][ids_dictionary_index[other_id_type]] : 
                     ids_dictionary[id][ids_dictionary_index[other_id_type]].remove('')
 
-    print ("dictionary created")
+    #print ("dictionary created")
 
     #Get file and/or ids from input 
     if args.input_type == "list" :
@@ -172,14 +181,14 @@
     elif args.input_type == "file" :
         input_file, ids = get_input_ids_from_file(args.input,args.column_number,header)
 
-    print ("starting mapping")
+    #print ("starting mapping")
 
     #Mapping ids
     result_dict = map_to_dictionary(ids,ids_dictionary,args.id_type,target_ids)
 
-    print ("mapping done")
+    #print ("mapping done")
 
-    print ("creating output file")
+    #print ("creating output file")
     #creating output file 
     with open(args.output,"w") as output :
         writer = csv.writer(output,delimiter="\t")
@@ -188,25 +197,31 @@
         #write header
         if header : 
             writer.writerow(input_file[0]+target_ids)
-            input_file = input_file[1:]
-        else :
+            input_file = input_file[1:]  
+        elif args.input_type=="file":
+            col_names = create_header(input_file,args.column_number,args.id_type,target_ids)
+            writer.writerow(col_names)
+        else : 
             writer.writerow([args.id_type]+target_ids)
 
         #write lines 
+        previous_line=""
         if args.input_type=="file" :
             for line in input_file :
-                tmp = output_one_id_one_line(line,result_dict[line[args.column_number]],target_ids)
-                tmp = blank_to_NA(tmp)
-                for row in tmp :
-                    writer.writerow(row)
+                res = [";".join(list(res_ids)) for res_ids in result_dict[line[args.column_number]]]
+                line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line+res]
+                if previous_line != line :
+                    writer.writerow(line)
+                    previous_line=line
         elif args.input_type=="list" :
             for id in ids :
-                tmp = output_one_id_one_line([id],result_dict[id],target_ids)
-                tmp = blank_to_NA(tmp)
-                for row in tmp :
-                    writer.writerow(row)
+                res = [";".join(list(res_ids)) for res_ids in result_dict[id]]
+                line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in [id]+res]
+                if previous_line != line :
+                    writer.writerow(line)
+                    previous_line=line
 
-        print ("output file created")
+        #print ("output file created")
 
 if __name__ == "__main__":
     main()