Mercurial > repos > jaredgk > ppp_vcfphase

--- a/beagle.py	Tue Oct 16 18:03:51 2018 -0400
+++ b/beagle.py	Wed Oct 17 17:20:47 2018 -0400
@@ -144,7 +144,7 @@
         output_format : str
             Output file format
     '''
-    print beagle_call_args
+    print (beagle_call_args)
     # Standard call to beagle
     standard_beagle_call(beagle_path, beagle_call_args, output_prefix)
--- a/model.py	Tue Oct 16 18:03:51 2018 -0400
+++ b/model.py	Wed Oct 17 17:20:47 2018 -0400
@@ -5,8 +5,11 @@
 import argparse
 import logging
 import itertools
+import copy

-from collections import defaultdict
+import numpy as np
+
+from collections import defaultdict, OrderedDict

 # Insert Jared's directory path, required for calling Jared's functions. Change when directory structure changes.
 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))
@@ -20,12 +23,72 @@
         self.ind_file = ''
         self.exclude_file = ''

-    def assign_inds (self, inds = []):
-        # Return error if inds is empty
-        if not inds:
-            raise IOError('No individuals found in the model file.')
+        if arg and self.confirm_model_instance(arg[1]):
+            self.update_inds(arg[1])
+
+    def __setitem__(self, *arg, **kw):
+        super(ModelFile, self).__setitem__(*arg, **kw)
+
+        if arg and self.confirm_model_instance(arg[1]):
+            self.update_inds(model = arg[1])
+
+    def __delitem__(self, key):
+        super(ModelFile, self).__delitem__(key)
+        self.update_inds()
+
+    def confirm_model_instance (self, unknown):
+
+        if isinstance(unknown, Model):
+
+            return True
+
+        else:
+
+            return False
+
+    def copy_model (self, src_model_name, new_model_name):
+
+        src_model = super(ModelFile, self).__getitem__(src_model_name)
+
+        src_model_copy = copy.deepcopy(src_model)
+
+        src_model_copy.name = new_model_name
+
+        super(ModelFile, self).__setitem__(new_model_name, src_model_copy)
+
+    def rename_model (self, src_model_name, new_model_name):
+
+        src_model = super(ModelFile, self).pop(src_model_name)
+
+        src_model.name = new_model_name
+
+        super(ModelFile, self).__setitem__(new_model_name, src_model)
+
+    def update_inds (self, model = None):
+
+        if self.confirm_model_instance(model):
+
+            # Return error if inds is empty
+            if not model.inds:
+                raise IOError('No individuals found in %s.' % model.name)
+
+            # Create a list of the unique individuals
+            unique_inds = list(set(self.inds + model.inds))
+
+        else:
+
+            # Create an empty list for the unique individuals
+            unique_inds = []
+
+            # Loop the models in the file
+            for model_in_file in super(ModelFile, self).values():
+
+                # Create a list of the unique individuals
+                unique_inds = list(set(unique_inds + model_in_file.inds))
+
+
         # Store the individuals
-        self.inds = [str(ind) for ind in inds]
+        self.inds = unique_inds

     def create_ind_file (self, file_ext = '', file_path = '', overwrite = False):
         # Assign the filename for the population file
@@ -84,7 +147,7 @@
         # Save the individuals filename
         self.exclude_file = ind_filename

-    def delete_ind_file (self):
+    def delete_exclude_ind_file (self):
         # Check if an individuals file was created
         if self.exclude_file:

@@ -94,30 +157,102 @@
             # Remove the filename
             self.exclude_file = ''

+    def to_json (self):
+
+        model_file_json = []
+
+        for model_name, model_data in super(ModelFile, self).items():
+            model_file_json.append(model_data.to_json())
+
+        return model_file_json
+
+
 class Model:
     def __init__ (self, name):
         self.name = name
         self.tree = ''
-        self.npop = 0
         self.pop_list = []
+        self.ind_dict = defaultdict(list)
         self.nind = defaultdict(int)
-        self.ind_dict = defaultdict(list)
         self.pop_files = []
         self.ind_file = ''

     @property
-    def inds(self):
+    def npop (self):
+        return len(self.pop_list)
+
+    @property
+    def inds (self):
         return list(itertools.chain.from_iterable(self.ind_dict.values()))

     def assign_tree (self, tree):
         self.tree = str(tree)

     def assign_pop (self, pop, inds = []):
-        self.npop += 1
         self.pop_list.append(str(pop))
         if inds:
-            self.nind[pop] = len(inds)
             self.ind_dict[pop] = [str(ind) for ind in inds]
+        self.nind[pop] = len(self.ind_dict[pop])
+
+    def sample_pop (self, pop, sample_size, with_replacements = False):
+
+        # Confirm the pop is in the model
+        if str(pop) not in self.pop_list:
+
+            # Raise error if pop not found
+            raise Exception('%s not found' % pop)
+
+        # Confirm the sample size is an int
+        try:
+
+            sample_size = int(sample_size)
+
+        except:
+
+            # Raise error if sample_size not an int
+            raise Exception('%s not int' % sample_size)
+
+        # Check if the sample size is larger than the pop
+        if int(sample_size) > self.nind[pop]:
+
+            # Raise error if sample_size is larger
+            raise Exception('%s is larger than %s' % (sample_size, pop))
+
+        # Use numpy choice to randomly sample the pop
+        sampled_inds = np.random.choice(self.ind_dict[pop], sample_size, replace = with_replacements)
+
+        # Save the sampled inds as a list
+        self.ind_dict[pop] = list(sampled_inds)
+
+    def sample_pops (self, sample_size, with_replacements = False):
+
+        # Confirm the sample size is an int
+        try:
+
+            sample_size = int(sample_size)
+
+        except:
+
+            # Raise error if sample_size not an int
+            raise Exception('%s not int' % sample_size)
+
+        # Loop each pop in the pop list
+        for pop in self.pop_list:
+
+            # Check if the sample size is larger than the pop
+            if int(sample_size) > self.nind[pop]:
+
+                # Raise error if sample_size is larger
+                raise Exception('%s is larger than %s' % (sample_size, pop))
+
+        # Loop each pop in the pop list, if no error raised
+        for pop in self.pop_list:
+
+            # Use numpy choice to randomly sample the pop
+            sampled_inds = np.random.choice(self.ind_dict[pop], sample_size, replace = with_replacements)
+
+            # Save the sampled inds as a list
+            self.ind_dict[pop] = list(sampled_inds)

     def create_pop_files (self, file_ext = '', file_path = '', overwrite = False):
         for pop in self.pop_list:
@@ -186,10 +321,28 @@
             # Remove the filename
             self.ind_file = ''

-def read_model_file (model_filename):
+    def to_json (self):
+
+        model_json = OrderedDict()
+
+        model_json['name'] = self.name
+
+        pop_json = OrderedDict()
+
+        for pop in self.pop_list:
+
+            pop_json[pop] = OrderedDict()
+
+            pop_json[pop]['indv'] = self.ind_dict[pop]
+
+        model_json['pops'] = pop_json
+
+        return model_json
+
+def read_model_file (filename):

     # Check that the file exists
-    if not os.path.isfile(model_filename):
+    if not os.path.isfile(filename):
         raise IOError

     # Create ModelFile object
@@ -198,10 +351,10 @@
     # Check if using python 2 or 3
     if sys.version_info[0] == 2:
         # Open the model file in python 2
-        model_file = open(model_filename, 'rU')
+        model_file = open(filename, 'rU')
     else:
         # Open the model file in python 3
-        model_file = open(model_filename, 'r', newline=None)
+        model_file = open(filename, 'r', newline=None)

     # Parse the model file using the json reader
     models_dict = json.load(model_file)
@@ -213,15 +366,19 @@
     for model_dict in models_dict:

         # Create the model
-        model = Model(model_dict['name'])
+        model = Model(str(model_dict['name']))

         # Loop the populations in the model
         for pop, pop_dict in model_dict['pops'].items():

+            # Convert all individuals names to str
+            ind_list = [str(pop_ind) for pop_ind in pop_dict['inds']]
+
             # Assign the population ans it's individuals to the model
-            model.assign_pop(pop, pop_dict['inds'])
+            model.assign_pop(str(pop), ind_list)
+
             # Assign the individuals to the unique individual list
-            individual_list.extend(pop_dict['inds'])
+            individual_list.extend(ind_list)

         # Remove duplicates from the unique individual list
         individual_list = list(set(individual_list))
@@ -229,8 +386,27 @@
         # Save the model
         models_to_return[str(model.name)] = model

-    # Store the  unique individuals within the ModelFile object
-    models_to_return.assign_inds(individual_list)
+    logging.info('Finished reading model file (%s)' % filename)

     # Return the models
     return models_to_return
+
+def write_model_file (model_file, filename, overwrite = False):
+
+    # Check if the file is to be overwritten
+    if not overwrite:
+
+        # Check if the file exists
+        if os.path.exists(filename):
+            raise Exception('%s already exists' % filename)
+
+    # Open the output file
+    output_file = open(filename, 'w')
+
+    # Write the json-formmated data to the output file
+    output_file.write(json.dumps(model_file.to_json(), indent = 4))
+
+    # Close the output file
+    output_file.close()
+
+    logging.info('Finished writing model file (%s)' % filename)