Mercurial > repos > proteore > proteore_id_converter

diff id_converter.py @ 18:5252bbcfbdd7 draft
planemo upload commit dc6d9d8f1808c4c6bcc42ac6861a8b811e4cee58-dirty
author: proteore
date: Fri, 10 May 2019 10:38:46 -0400
parents: 1e45ea50f145
children: 9d758344d36e
--- a/id_converter.py	Thu Mar 07 07:49:18 2019 -0500
+++ b/id_converter.py	Fri May 10 10:38:46 2019 -0400
@@ -1,4 +1,4 @@
-import sys, os, argparse, re, csv
+import sys, os, argparse, re, csv, itertools
 
 def get_args() :
     parser = argparse.ArgumentParser()
@@ -58,6 +58,21 @@
 
     return new_file, ids_list
 
+def output_one_id_one_line(line,convert_ids,target_ids):
+    #Build the output rows for one input line: one row per combination of converted ids (one id taken from each entry of convert_ids), each row prefixed with `line`
+    ids_not_processed = ["GI","PDB","GO","PIR","MIM","UniGene","BioGrid","STRING"]  #id types kept as a single ";"-joined cell instead of being expanded to one id per row
+    ids_not_processed = [id for id in ids_not_processed if id in target_ids]    #keep only the id types of that list that are present in target_ids
+    #collapse each of these entries to a one-element list so that it contributes exactly one value to the product below
+    for id_not_processed in ids_not_processed :
+        index = target_ids.index(id_not_processed)  #convert_ids is addressed with the position of the id type in target_ids
+        convert_ids[index] = [";".join(convert_ids[index])]  #NOTE: replaces the entry inside the list object passed by the caller (in-place modification)
+    #NOTE(review): itertools.product yields nothing when any entry of convert_ids is empty (empty list or "") -- confirm callers never pass one for a non-joined id type, otherwise no row is returned for this line
+    res = itertools.product(*convert_ids)   #every combination taking one id from each entry of convert_ids
+    res = [list(e) for e in res]            #convert the tuples to lists
+    res = [line+list(ids) for ids in res]   #prepend the original line (has to be a list) to each combination
+
+    return(res)
+        
 #return the column number in int format
 def nb_col_to_int(nb_col):
     try :
@@ -90,7 +105,7 @@
     for id in ids : 
         for target_id in id_out :
             if id in ids_dictionary :
-                res = ";".join(ids_dictionary[id][target_id])
+                res = ids_dictionary[id][target_id]
             else :
                 res=""
             
@@ -149,36 +164,49 @@
                 if len(ids_dictionary[id][ids_dictionary_index[other_id_type]]) > 1 and '' in ids_dictionary[id][ids_dictionary_index[other_id_type]] : 
                     ids_dictionary[id][ids_dictionary_index[other_id_type]].remove('')
 
+    print ("dictionary created")
+
     #Get file and/or ids from input 
     if args.input_type == "list" :
         ids = get_input_ids_from_string(args.input)
     elif args.input_type == "file" :
         input_file, ids = get_input_ids_from_file(args.input,args.column_number,header)
 
+    print ("starting mapping")
+
     #Mapping ids
     result_dict = map_to_dictionary(ids,ids_dictionary,args.id_type,target_ids)
 
-    #creating output file 
-    if header : 
-        output_file=[input_file[0]+target_ids]
-        input_file = input_file[1:]
-    else :
-        output_file=[[args.id_type]+target_ids]
+    print ("mapping done")
 
-    if args.input_type=="file" :
-        for line in input_file :
-            output_file.append(line+result_dict[line[args.column_number]])
-    elif args.input_type=="list" :
-        for id in ids :
-            output_file.append([id]+result_dict[id])
-
-    #convert blank to NA
-    output_file = blank_to_NA(output_file)
-
-    #write output file 
+    print ("creating output file")
+    #creating output file 
     with open(args.output,"w") as output :
         writer = csv.writer(output,delimiter="\t")
-        writer.writerows(output_file)
+        #writer.writerows(output_file)
+
+        #write header
+        if header : 
+            writer.writerow(input_file[0]+target_ids)
+            input_file = input_file[1:]
+        else :
+            writer.writerow([args.id_type]+target_ids)
+
+        #write lines 
+        if args.input_type=="file" :
+            for line in input_file :
+                tmp = output_one_id_one_line(line,result_dict[line[args.column_number]],target_ids)
+                tmp = blank_to_NA(tmp)
+                for row in tmp :
+                    writer.writerow(row)
+        elif args.input_type=="list" :
+            for id in ids :
+                tmp = output_one_id_one_line([id],result_dict[id],target_ids)
+                tmp = blank_to_NA(tmp)
+                for row in tmp :
+                    writer.writerow(row)
+
+        print ("output file created")
 
 if __name__ == "__main__":
     main()
author	proteore
date	Fri, 10 May 2019 10:38:46 -0400
parents	1e45ea50f145
children	9d758344d36e