diff pca.py @ 24:248b06e86022

Added gd_genotype datatype. Modified tools to support new datatype.
author Richard Burhans <burhans@bx.psu.edu>
date Tue, 28 May 2013 16:24:19 -0400
parents 2c498d40ecde
children 8997f2ca8c7a
line wrap: on
line diff
--- a/pca.py	Wed May 22 15:58:18 2013 -0400
+++ b/pca.py	Tue May 28 16:24:19 2013 -0400
@@ -7,6 +7,7 @@
 import sys
 from BeautifulSoup import BeautifulSoup
 import gd_composite
+import re
 
 ################################################################################
 
@@ -62,6 +63,8 @@
 
 def make_ind_file(ind_file, input):
     pops = []
+    name_map = []
+    name_idx = 0
 
     ofh = open(ind_file, 'w')
 
@@ -78,11 +81,13 @@
                 individuals = entry.ol('li')
                 for individual in individuals:
                     individual_name = individual.string.encode('utf8').strip()
-                    print >> ofh, individual_name, 'M', population_name
+                    name_map.append(individual_name)
+                    print >> ofh, 'ind_%s' % name_idx, 'M', population_name
+                    name_idx += 1
             i += 1
 
     ofh.close()
-    return pops
+    return pops, name_map
 
 def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file):
     with open(par_file, 'w') as fh:
@@ -175,6 +180,26 @@
 
     shutil.copy2('fake', coords_file)
 
+ind_regex = re.compile('ind_([0-9]+)')
+
+def fix_names(name_map, files):
+    for file in files:
+        tmp_filename = '%s.tmp' % file
+        with open(tmp_filename, 'w') as ofh:
+            with open(file) as fh:
+                for line in fh:
+                    line = line.rstrip('\r\n')
+                    match = ind_regex.search(line)
+                    if match:
+                        idx = int(match.group(1))
+                        old = 'ind_%s' % idx
+                        new = name_map[idx].replace(' ', '_')
+                        line = line.replace(old, new)
+                    print >> ofh, line
+
+        shutil.copy2(tmp_filename, file)
+        os.unlink(tmp_filename)
+        
 ################################################################################
 
 if len(sys.argv) != 5:
@@ -194,7 +219,7 @@
 do_map2snp(map_file, snp_file)
 
 ind_file = os.path.join(output_files_path, 'admix.ind')
-population_names = make_ind_file(ind_file, input)
+population_names, name_map = make_ind_file(ind_file, input)
 
 par_file = os.path.join(output_files_path, 'par.admix')
 evec_file = os.path.join(output_files_path, 'coordinates.txt')
@@ -202,6 +227,7 @@
 make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file)
 
 smartpca_stats = do_smartpca(par_file)
+fix_names(name_map, [ind_file, evec_file])
 
 do_ploteig(evec_file, population_names)
 plot_file = 'coordinates.txt.1:2.{0}.pdf'.format(':'.join(population_names))