Next changeset: 1:8c62c71ce379 (2018-10-19)

Commit message:
Uploaded

added:
datatypes_conf.xml
logging_module.py
model.py
parse_functions.py
ppp_model.py
vcf_reader_func.py
vcf_to_ima.py
diff -r 000000000000 -r 1ad9c146ebb4 datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml Fri Oct 19 14:36:02 2018 -0400
@@ -0,0 +1,9 @@
+<?xml version="1.0"?>
+<datatypes>
+    <datatype_files>
+        <datatype_file name="ppp_model.py"/>
+    </datatype_files>
+    <registration>
+        <datatype extension="pppmodel" type="galaxy.datatypes.ppp_model:pppModel" display_in_upload="True"/>
+    </registration>
+</datatypes>
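Note: a quick way to see what this datatypes_conf.xml registers is to read it back with the Python standard library; the snippet below is illustrative and assumes the file sits in the working directory.

import xml.etree.ElementTree as ET

# Print each registered extension and the datatype class it maps to.
tree = ET.parse("datatypes_conf.xml")
for dt in tree.getroot().iter("datatype"):
    print(dt.get("extension"), "->", dt.get("type"))
# Expected output: pppmodel -> galaxy.datatypes.ppp_model:pppModel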
diff -r 000000000000 -r 1ad9c146ebb4 logging_module.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/logging_module.py Fri Oct 19 14:36:02 2018 -0400
@@ -0,0 +1,97 @@
+import sys
+import logging
+
+
+
+def initLogger(filename='pipeline.log', filelevel='INFO',
+               streamlevel='WARNING', resetlog=True):
+    """General logger initialization for PPP functions.
+
+    Messages from WARNING level and higher will be logged to file and to
+    stderr. By default, INFO level will also be written to logfile. Both
+    levels are flexible.
+
+    Level groupings:
+    ERROR: Error messages should be generated by an exception call
+    WARNING: Non-terminal behavior that may be unusual (e.g. lists
+    with no values, blank strings)
+    INFO: Every major (user-facing) function should have the following:
+        -Message for function start
+        -List of input arguments and options
+        -Basic sanity checks (dimensions of input data)
+        -Statements before or after major function calls
+        -Message for function end
+    DEBUG: Mainly for developer use/debugging. Generate logs for
+    sub-functions that match INFO level for major functions.
+    Care should be used if there are lots of loops called.
+
+    Use: Call with either the individual function (in __name__=="__main__"
+    statement) or in pipeline file.
+
+    Parameters
+    ----------
+    filename : str ("pipeline.log")
+        Name of file that log will be written to
+
+    filelevel : {'INFO','DEBUG','WARNING','ERROR'}
+        Set minimum level of log messages that are written to log file.
+        Note that this acts as a de facto minimum for 'streamlevel' as well.
+
+    streamlevel : {'WARNING','DEBUG','INFO','ERROR'}
+        Set minimum level of log messages that are output to stream.
+
+    resetlog : bool (True)
+        If true, will overwrite logfile when opening. Set to false if log is
+        being initialized multiple times
+
+    Returns
+    -------
+    None
+
+    Exceptions
+    ----------
+    If filelevel or streamlevel are not a valid logging level
+
+    """
+    log_levels = ['DEBUG','INFO','WARNING','ERROR']
+    if filelevel is not None and filelevel.upper() not in log_levels:
+        raise Exception('filelevel value %s is not a valid level' %
+                        filelevel)
+    if streamlevel is not None and streamlevel.upper() not in log_levels:
+        raise Exception('streamlevel value %s is not a valid level' %
+                        streamlevel)
+    fmt_def = "%(asctime)s - %(funcName)s - %(levelname)s: %(message)s"
+    fmt_notime = "%(funcName)s - %(levelname)s: %(message)s"
+    fmtr = logging.Formatter(fmt=fmt_def)
+    fmtr_notime = logging.Formatter(fmt=fmt_notime)
+    filelogger = logging.getLogger()
+    filelogger.setLevel('INFO')
+    if streamlevel is not None:
+        s_handler = logging.StreamHandler()
+        s_handler.setFormatter(fmtr_notime)
+        s_handler.setLevel(streamlevel)
+        filelogger.addHandler(s_handler)
+    logmode = 'a'
+    if resetlog:
+        logmode = 'w'
+    if filelevel is not None:
+        f_handler = logging.FileHandler(filename,mode=logmode)
+        f_handler.setFormatter(fmtr)
+        f_handler.setLevel(filelevel)
+        #filelogger.setLevel(filelevel)
+        filelogger.addHandler(f_handler)
+    #Formats exception messages to be sent to appropriate loggers
+    def exp_handler(etype,val,tb):
+        logging.error("%s" % (val), exc_info=(etype,val,tb))
+
+    sys.excepthook = exp_handler
+
+
+def logArgs(args, func_name=None, print_nones=False):
+    header = "Arguments"
+    if func_name is not None:
+        header += " for " + func_name
+    for arg in vars(args):
+        val = vars(args)[arg]
+        if val is not None or print_nones:
+            logging.info('Argument %s: %s' % (arg,val))
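Note: for orientation, this is a minimal sketch of how initLogger and logArgs are meant to be wired into a PPP script. The argparse options and file names are illustrative, not part of this changeset.

import argparse
import logging
import sys

from logging_module import initLogger, logArgs

def example_main(sysargs):
    parser = argparse.ArgumentParser()
    parser.add_argument('--vcf', dest='vcfname')             # illustrative option
    parser.add_argument('--log-file', default='example.log')
    args = parser.parse_args(sysargs)

    # WARNING and above go to stderr; INFO and above go to the log file.
    initLogger(filename=args.log_file, filelevel='INFO', streamlevel='WARNING')
    logArgs(args, func_name='example_main')                  # records every non-None argument
    logging.info('Example step starting')

if __name__ == "__main__":
    example_main(sys.argv[1:])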
diff -r 000000000000 -r 1ad9c146ebb4 model.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/model.py Fri Oct 19 14:36:02 2018 -0400
| @@ -0,0 +1,236 @@ +import os +import sys +import json +import subprocess +import argparse +import logging +import itertools + +from collections import defaultdict + +# Insert Jared's directory path, required for calling Jared's functions. Change when directory structure changes. +sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared'))) + +from logging_module import initLogger + +class ModelFile(dict): + def __init__(self, *arg, **kw): + super(ModelFile, self).__init__(*arg, **kw) + self.inds = [] + self.ind_file = '' + self.exclude_file = '' + + def assign_inds (self, inds = []): + # Return error if inds is empty + if not inds: + raise IOError('No individuals found in the model file.') + # Store the individuals + self.inds = [str(ind) for ind in inds] + + def create_ind_file (self, file_ext = '', file_path = '', overwrite = False): + # Assign the filename for the population file + ind_filename = 'unique_individuals' + file_ext + + # If a path is assigned, create the file at the specified location + if file_path: + ind_filename = os.path.join(file_path, ind_filename) + + # Check if previous files should be overwriten + if not overwrite: + # Check if the file already exists + if os.path.isfile(ind_filename): + raise IOError('Individuals file exists.') + + # Create the population file + ind_file = open(ind_filename, 'w') + ind_file.write('%s\n' %'\n'.join(self.inds)) + ind_file.close() + + # Save the individuals filename + self.ind_file = ind_filename + + def delete_ind_file (self): + # Check if an individuals file was created + if self.ind_file: + + # Delete the individuals file + os.remove(self.ind_file) + + # Remove the filename + self.ind_file = '' + + def create_exclude_ind_file (self, inds_to_include = [], file_ext = '', file_path = '', overwrite = False): + # Assign the filename for the population file + ind_filename = 'exclude_individuals' + file_ext + + # If a path is assigned, create the file at the specified location + if file_path: + ind_filename = os.path.join(file_path, ind_filename) + + # Check if previous files should be overwriten + if not overwrite: + # Check if the file already exists + if os.path.isfile(ind_filename): + raise IOError('Individuals file exists.') + + # Create exclude list by removing included individuals + exclude_inds = list(set(self.inds) - set(inds_to_include)) + + # Create the population file + ind_file = open(ind_filename, 'w') + ind_file.write('%s\n' %'\n'.join(exclude_inds)) + ind_file.close() + + # Save the individuals filename + self.exclude_file = ind_filename + + def delete_ind_file (self): + # Check if an individuals file was created + if self.exclude_file: + + # Delete the individuals file + os.remove(self.exclude_file) + + # Remove the filename + self.exclude_file = '' + +class Model: + def __init__ (self, name): + self.name = name + self.tree = '' + self.npop = 0 + self.pop_list = [] + self.nind = defaultdict(int) + self.ind_dict = defaultdict(list) + self.pop_files = [] + self.ind_file = '' + + @property + def inds(self): + return list(itertools.chain.from_iterable(self.ind_dict.values())) + + def assign_tree (self, tree): + self.tree = str(tree) + + def assign_pop (self, pop, inds = []): + self.npop += 1 + self.pop_list.append(str(pop)) + if inds: + self.nind[pop] = len(inds) + self.ind_dict[pop] = [str(ind) for ind in inds] + + def create_pop_files (self, file_ext = '', file_path = '', overwrite = False): + for pop in self.pop_list: + # Assign the filename for the population file + pop_filename = pop + file_ext + + # If a 
path is assigned, create the file at the specified location + if file_path: + pop_filename = os.path.join(file_path, pop_filename) + + # Check if previous files should be overwriten + if not overwrite: + # Check if the file already exists + if os.path.isfile(pop_filename): + raise IOError('Population file exists.') + + # Create the population file + pop_file = open(pop_filename, 'w') + pop_file.write('%s\n' %'\n'.join(self.ind_dict[pop])) + pop_file.close() + + # Save the population filename + self.pop_files.append(pop_filename) + + def delete_pop_files (self): + # Check if pop files were created + if len(self.pop_files) != 0: + + # Loop the created pop files + for pop_file in self.pop_files: + # Delete the pop file + os.remove(pop_file) + + # Remove the filenames + self.pop_files = [] + + def create_ind_file (self, file_ext = '', file_path = '', overwrite = False): + # Assign the filename for the population file + ind_filename = 'individual.keep' + file_ext + + # If a path is assigned, create the file at the specified location + if file_path: + ind_filename = os.path.join(file_path, ind_filename) + + # Check if previous files should be overwriten + if not overwrite: + # Check if the file already exists + if os.path.isfile(ind_filename): + raise IOError('Individuals file exists.') + + # Create the population file + ind_file = open(ind_filename, 'w') + ind_file.write('%s\n' %'\n'.join(self.inds)) + ind_file.close() + + # Save the individuals filename + self.ind_file = ind_filename + + def delete_ind_file (self): + # Check if an individuals file was created + if self.ind_file: + + # Delete the individuals file + os.remove(self.ind_file) + + # Remove the filename + self.ind_file = '' + +def read_model_file (model_filename): + + # Check that the file exists + if not os.path.isfile(model_filename): + raise IOError(model_filename) + + # Create ModelFile object + models_to_return = ModelFile() + + # Check if using python 2 or 3 + if sys.version_info[0] == 2: + # Open the model file in python 2 + model_file = open(model_filename, 'rU') + else: + # Open the model file in python 3 + model_file = open(model_filename, 'r', newline=None) + + # Parse the model file using the json reader + models_dict = json.load(model_file) + + # List to store all unique individuals (i.e. individuals in all models) + individual_list = [] + + # Loop the parsed models + for model_dict in models_dict: + + # Create the model + model = Model(model_dict['name']) + + # Loop the populations in the model + for pop, pop_dict in model_dict['pops'].items(): + + # Assign the population ans it's individuals to the model + model.assign_pop(pop, pop_dict['inds']) + # Assign the individuals to the unique individual list + individual_list.extend(pop_dict['inds']) + + # Remove duplicates from the unique individual list + individual_list = list(set(individual_list)) + + # Save the model + models_to_return[str(model.name)] = model + + # Store the unique individuals within the ModelFile object + models_to_return.assign_inds(individual_list) + + # Return the models + return models_to_return |
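Note: the JSON layout below is a sketch of what read_model_file expects, based on the keys the parser above reads ('name', 'pops', and a per-population 'inds' list). Model, population, and sample names are made up for illustration.

import json

from model import read_model_file

example_models = [
    {
        "name": "2Pop",
        "pops": {
            "PopA": {"inds": ["Ind1", "Ind2"]},
            "PopB": {"inds": ["Ind3", "Ind4"]}
        }
    }
]

# Write the example model file, then read it back into ModelFile/Model objects.
with open("example.model", "w") as model_file:
    json.dump(example_models, model_file, indent=2)

models = read_model_file("example.model")        # dict keyed by model name
two_pop = models["2Pop"]
two_pop.create_pop_files(file_ext=".txt")        # writes PopA.txt and PopB.txt
two_pop.delete_pop_files()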
diff -r 000000000000 -r 1ad9c146ebb4 parse_functions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/parse_functions.py Fri Oct 19 14:36:02 2018 -0400
@@ -0,0 +1,184 @@
+try:
+    import ConfigParser
+except ImportError:
+    import configparser as ConfigParser
+import argparse
+import sys
+
+is_python3 = (sys.version_info[0] == 3)
+
+
+
+def parseOption(val):
+    """Given a value read from a config file, returns appropriately formatted
+    python variable.
+
+    -If `val` == 'None', returns None variable
+    -If `val` == 'True', returns True
+    -If `val` == 'False', returns False
+    -If `val` is a float, returns float-casted val
+    -If `val` is an int and not a float, returns an int-casted val
+    -If none of these criteria are met, returns the input value
+
+    Parameters
+    ----------
+    val : str
+        String from dict created by config parser
+
+    Output
+    ------
+    val : multi
+        Parsed config value for use with argparse
+
+
+    """
+    if val == 'None':
+        return None
+    if val == 'True':
+        return True
+    if val == 'False':
+        return False
+    try:
+        a = int(val)
+        b = float(val)
+        if a == b:
+            return int(val)
+        return float(val)
+    except:
+        pass
+    return val
+
+def getConfigFilename(arglist, flag='--conf'):
+    """Reads argument list to determine if there is a config file provided.
+
+    Used to work with argparse. If `flag` is present in the input array,
+    the next argument is assumed to be the filename for the config file.
+
+    Parameters
+    ----------
+    arglist : list
+        List of all arguments to input function, from either sys.argv or a
+        custom list
+    flag : str (optional) ('--conf')
+        Command-line flag to look for in arglist to signify next element
+        is filename for config options
+
+    Returns
+    -------
+    config_filename : str (None)
+        Name of config file if 'flag' is present in arglist, otherwise
+        defaults to None
+    is_first : bool
+        Will be true if `flag` is first element of arglist. Used for method
+        to allow required args to be passed from config file.
+
+    """
+    config_filename = None
+    is_first = False
+    if flag in arglist:
+        idx = arglist.index(flag)
+        config_filename = arglist[idx+1]
+        if idx == 0:
+            is_first = True
+        return config_filename,is_first
+    return None,is_first
+
+
+def defaultsDictForFunction(func_name, config_name):
+    """Creates a dict with default values for a given function.
+
+    Given an input config file and a function name, this function will
+    read in all values from the given section of the config file, then
+    remove the variables that are used in the config file to define
+    parameters for multiple functions.
+
+    Parameters
+    ----------
+    func_name : str
+        Name of function section of config file. (May not necessarily
+        be name of function in code)
+    config_name : str
+        Name of config file
+
+    Returns
+    -------
+    d_func : dict
+        Dictionary with all attributes for the given function. Values written
+        as 'None' in config file will not be added, so normal defaults
+        can be used.
+
+
+    """
+    if is_python3:
+        config = ConfigParser.ConfigParser()
+    else:
+        config = ConfigParser.SafeConfigParser()
+    config.read(config_name)
+    d_vars = dict(config.items("DEFAULT"))
+    d_hold = dict(config.items(func_name))
+    d_func = {}
+    for k in d_hold.keys():
+        if k not in d_vars:
+            val = parseOption(d_hold[k])
+            if val is not None:
+                d_func[k] = val
+    return d_func
+
+def makeRequiredList(argdict, reqlist):
+    """Grabs required args from argument dict created by configparser.
+
+    Generates a list of required parameters that will be passed in lieu of
+    a regular arglist or sys.argv. Must be ordered in the same way they will
+    be input to argparse.
+
+    Parameters
+    ----------
+    argdict : dict
+        Dictionary of arguments returned from defaultsDictForFunction.
+    reqlist : list
+        List of required arguments, in order that they should be passed to
+        argparse.
+
+    Returns
+    -------
+    req_arglist : list
+        List of required argument values for given function, in order for
+        passing to argparse
+
+    Exceptions
+    ----------
+    Argument value in argdict for required arg is None
+
+
+
+    """
+    req_arglist = []
+    for arg in reqlist:
+        argval = argdict[arg]
+        if argval is None:
+            raise Exception("Value for required argument %s cannot be None"
+                            % arg)
+        req_arglist.append(argdict[arg])
+    return req_arglist
+
+
+def checkRequired(required_args, defaults):
+    for req in required_args:
+        if req in defaults and defaults[req] is not None:
+            raise Exception(('Required argument %s has value present '
+                             'in config file (%s) and command line' %
+                             (req, defaults[req])))
+
+
+def getArgsWithConfig(parser, sys_args, required_args, func_name):
+    config_name, req_included = getConfigFilename(sys_args)
+    if config_name is not None:
+        defaults = defaultsDictForFunction(func_name, config_name)
+        if not req_included:
+            checkRequired(required_args, defaults)
+        parser.set_defaults(**defaults)
+    if not req_included:
+        return parser.parse_args(sys_args)
+    else:
+        req_args = makeRequiredList(defaults, required_args)
+        return parser.parse_args(req_args)
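Note: a short sketch of the config-file round trip these helpers implement. The section name, option names, and file names are hypothetical; [DEFAULT] entries are shared across sections and filtered out by defaultsDictForFunction.

import argparse

from parse_functions import parseOption, getArgsWithConfig

# parseOption converts the raw strings ConfigParser returns.
print(parseOption("True"), parseOption("None"), parseOption("3"))   # True None 3

config_text = """[DEFAULT]
project_dir = /tmp/ppp_example

[vcf_to_ima]
vcfname = example.vcf.gz
zeroho = True
popname = None
"""
with open("example.conf", "w") as conf_file:
    conf_file.write(config_text)

parser = argparse.ArgumentParser()
parser.add_argument("--vcf", dest="vcfname")
parser.add_argument("--zero-ho", dest="zeroho", action="store_true")
parser.add_argument("--conf", dest="config_name")

# With '--conf' given first, argparse defaults come entirely from the config file.
args = getArgsWithConfig(parser, ["--conf", "example.conf"],
                         required_args=[], func_name="vcf_to_ima")
print(args.vcfname, args.zeroho)   # example.vcf.gz True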
diff -r 000000000000 -r 1ad9c146ebb4 ppp_model.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ppp_model.py Fri Oct 19 14:36:02 2018 -0400
@@ -0,0 +1,59 @@
+from __future__ import print_function
+
+import gzip
+import logging
+import os
+import shutil
+import subprocess
+import tempfile
+import json
+
+import pysam
+from bx.seq.twobit import TWOBIT_MAGIC_NUMBER, TWOBIT_MAGIC_NUMBER_SWAP, TWOBIT_MAGIC_SIZE
+
+from galaxy.datatypes import metadata
+from galaxy.datatypes.metadata import DictParameter, ListParameter, MetadataElement, MetadataParameter
+
+from binary import Binary
+from text import Json
+
+log = logging.getLogger(__name__)
+
+class Model(Json):
+    file_ext = ".model"
+    #edam_format = "format_3746"
+
+    MetadataElement(name="models", default=[], desc="Models", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value=[])
+    MetadataElement(name="npop_dict", default={}, desc="Model population count", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value={})
+
+    # Update as format becomes more defined
+    def set_peek(self, dataset, is_multi_byte=False):
+        if not dataset.dataset.purged:
+            dataset.blurb = "Model File"
+
+    # Update as format becomes more defined
+    '''
+    def sniff(self, filename):
+        try:
+            json.loads(filename.file_name)
+            return True
+        except Exception:
+            return False
+    '''
+
+    def set_meta(self, dataset, **kwd):
+        if dataset.has_data():
+            with open(dataset.file_name) as model_file:
+                try:
+                    model_list = json.load(model_file)
+                except Exception:
+                    return
+
+                # Assign models
+                models = [model_dict['name'] for model_dict in model_list if 'name' in model_dict]
+
+                # Assign population count per model
+                npop_dict = dict([(model_dict['name'], len(model_dict['pops'])) for model_dict in model_list if 'name' in model_dict])
+
+                dataset.metadata.models = models
+                dataset.metadata.npop_dict = npop_dict
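Note: the metadata that set_meta derives can be previewed outside Galaxy; this standalone sketch mirrors the two comprehensions above and assumes the example.model file from the earlier note.

import json

with open("example.model") as model_file:
    model_list = json.load(model_file)

models = [m['name'] for m in model_list if 'name' in m]
npop_dict = {m['name']: len(m['pops']) for m in model_list if 'name' in m}

print(models)      # ['2Pop']
print(npop_dict)   # {'2Pop': 2}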
diff -r 000000000000 -r 1ad9c146ebb4 vcf_reader_func.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_reader_func.py Fri Oct 19 14:36:02 2018 -0400
| b'@@ -0,0 +1,480 @@\n+import sys\n+import pysam\n+import logging\n+import struct\n+from random import sample\n+from collections import defaultdict\n+import os\n+import gzip\n+\n+def checkIfGzip(filename):\n+ try:\n+ gf = gzip.open(filename)\n+ gl = gf.readline()\n+ gf.close()\n+ vcf_check = b\'##fileformat=VCF\'\n+ if gl[0:3] == b\'BCF\':\n+ return \'bcf\'\n+ elif gl[:len(vcf_check)] == vcf_check:\n+ return checkHeader(filename)\n+ else:\n+ return \'other\'\n+ except:\n+ return \'nozip\'\n+\n+def checkHeader(filename):\n+ f = open(filename,\'rb\')\n+ l = f.readline()\n+ f.close()\n+ BGZF_HEADER=b\'\\x1f\\x8b\\x08\\x04\\x00\\x00\\x00\\x00\\x00\\xff\\x06\\x00\\x42\\x43\\x02\\x00\'\n+ #BGZF_HEADER=b\'\\x1f\\x8b\\x08\\x04\\x00\\x00\\x00\\x00\\x00\\xff\'\n+ GZF_HEADER=b\'\\x1f\\x8b\'\n+ if l[:len(BGZF_HEADER)] == BGZF_HEADER:\n+ return \'bgzip\'\n+ if l[:len(GZF_HEADER)] == GZF_HEADER:\n+ return \'gzip\'\n+ return \'nozip\'\n+\n+def checkFormat(vcfname):\n+ """Checks header of given file for compression type\n+\n+\n+ Given a filename, opens file and reads first line to check if\n+ file has BGZF or GZIP header. May be extended to check for BCF format\n+\n+ Parameters\n+ ----------\n+ filename : str\n+ Name of file to be checked\n+\n+ Returns\n+ -------\n+ extension : str {\'bgzip\',\'gzip\',\'vcf\',\'other\'}\n+ File extension as indicated by header\n+\n+ """\n+ typ = checkIfGzip(vcfname)\n+ if typ != \'nozip\':\n+ return typ\n+ f = open(vcfname)\n+ l = f.readline()\n+ f.close()\n+ VCF_TAG=\'##fileformat=VCF\'\n+ if l[:len(VCF_TAG)] == VCF_TAG:\n+ return \'vcf\'\n+ return \'other\'\n+\n+def checkIfCpG(record,fasta_ref,offset=0,add_chr=False):\n+ dr = None\n+ pos = record.pos\n+ c = record.chrom\n+ if record.alts is None:\n+ return False\n+ if add_chr:\n+ c = \'chr\'+record.chrom\n+ if record.ref == \'C\' and \'T\' in record.alts:\n+ seq = fasta_ref.fetch(c,pos-1,pos+1)\n+ if seq[0].upper() != \'C\':\n+ logging.warning(\'%s %d has bad base %s\' % (record.chrom,record.pos,seq[0]))\n+ #raise Exception("checkIfCpG function not lining up properly")\n+ if seq[1].upper() == \'G\':\n+ return True\n+ return False\n+ elif record.ref == \'G\' and \'A\' in record.alts:\n+ seq = fasta_ref.fetch(c,pos-2,pos)\n+ if seq[1].upper() != \'G\':\n+ logging.warning(\'%s %d has bad base %s\' % (record.chrom,record.pos,seq[1]))\n+ #raise Exception("checkIfCpg function not lining up on negative strand")\n+ if seq[0].upper() == \'C\':\n+ return True\n+ return False\n+ return False\n+\n+def checkForDuplicates(rec_list,pass_list):\n+ for i in range(len(rec_list)-1):\n+ if rec_list[i].pos == rec_list[i+1].pos:\n+ pass_list[i] = False\n+ pass_list[i+1] = False\n+\n+def checkForMultiallele(rec_list,pass_list):\n+ for i in range(len(rec_list)):\n+ if i != len(rec_list)-1 and rec_list[i].pos == rec_list[i+1].pos:\n+ pass_list[i] = False\n+ pass_list[i+1] = False\n+ if len(rec_list[i].alleles) > 2:\n+ pass_list[i] = False\n+\n+def flipChrom(chrom):\n+ if chrom[0:3] == \'chr\':\n+ return chrom[0:3]\n+ return \'chr\'+chrom\n+\n+def getAlleleCountDict(rec):\n+ alleles = defaultdict(int)\n+ total_sites = 0\n+ missing_inds = 0\n+ for j in range(len(rec.samples)):\n+ samp = rec.samples[j]\n+ if None in samp.alleles:\n+ missing_inds += 1\n+ for k in range(len(samp.alleles)):\n+ b = samp.alleles[k]\n+ if b is not None:\n+ alleles[b] += 1\n+ total_sites+=1\n+ return alleles, total_sites, missing_inds\n+\n+def isInvariant(rec):\n+ alleles, total_sites, missing_inds = getAlleleCountDict(rec)\n+ if len(alleles) == 1:\n+ return True\n+ 
return False\n+\n+def isInformative(rec, mincount=2, alleles=None):\n+ count = 0\n+ if alle'..b'path to compressed VCF file\n+ """\n+ cvcfname = vcfname+".gz"\n+ pysam.tabix_compress(vcfname,cvcfname,force=forceflag)\n+ pysam.tabix_index(cvcfname,preset="vcf",force=True)\n+ if remove:\n+ os.remove(vcfname)\n+ return cvcfname\n+\n+def vcfRegionName(prefix, region, ext, oneidx=False,\n+ halfopen=True, sep=\'-\'):\n+ chrom = region.toStr(halfopen, oneidx, sep)\n+ return prefix+\'_\'+chrom+\'.\'+ext\n+\n+def getRecordsInRegion(region, record_list):\n+ sub_list = []\n+ for i in range(len(record_list)):\n+ loc = region.containsRecord(record_list[i])\n+ if loc == "in":\n+ sub_list.append(record_list[i])\n+ elif loc == "after":\n+ break\n+ return sub_list\n+\n+\n+\n+\n+\n+#def getVcfReader(args):\n+def getVcfReader(vcfname, compress_flag=False, subsamp_num=None,\n+ subsamp_fn=None, subsamp_list=None, index=None):\n+ """Returns a reader for a given input VCF file.\n+\n+ Given a filename, filetype, compression option, and optional Subsampling\n+ options, will return a pysam.VariantFile object for iteration and\n+ a flag as to whether this file is compressed or uncompressed.\n+\n+ Parameters\n+ ----------\n+ vcfname : str\n+ Filename for VCF file. The extension of this file will be used to\n+ determine whether it is compressed or not unless `var_ext` is set.\n+ var_ext : str (None)\n+ Extension for VCF file if it is not included in the filename.\n+ compress_flag : bool (False)\n+ If filetype is uncompressed and this is set to true, will run\n+ compressVcf function.\n+ subsamp_num : int (None)\n+ If set, will randomly select `subsamp_num` individuals (not\n+ genotypes) from the input VCF file and return a reader with\n+ only those data.\n+ subsamp_fn : str (None)\n+ If set, will return a reader with only data from the samples listed\n+ in the file provided. Cannot be used with other subsampling options.\n+ subsamp_list : list (None)\n+ If set, will return reader with records containing only\n+ individuals named in the list. Cannot be used with other subsampling\n+ options.\n+\n+ Returns\n+ -------\n+ vcf_reader : pysam.VariantFile\n+ A reader that can be iterated through for variant records. If\n+ compressed, it will be able to use the pysam fetch method, otherwise\n+ it must be read through sequentially\n+ reader_uncompressed : bool\n+ If True, VCF reader is uncompressed. 
This means the fetch method\n+ cannot be used and region access must be done using the\n+ "getRecordListUnzipped" method.\n+\n+ """\n+ ext = checkFormat(vcfname)\n+ if ext in [\'gzip\',\'other\'] :\n+ raise Exception((\'Input file %s is gzip-formatted, must be either \'\n+ \'uncompressed or zipped with bgzip\' % vcfname))\n+ file_uncompressed = (ext == \'vcf\')\n+ reader_uncompressed = (file_uncompressed and not compress_flag)\n+ if compress_flag and file_uncompressed:\n+ vcfname = compressVcf(vcfname)\n+ #subsamp_list = None\n+ if subsamp_num is not None:\n+ if subsamp_list is not None:\n+ raise Exception(\'Multiple subsampling options called in getVcfReader\')\n+ subsamp_list = getSubsampleList(vcfname, subsamp_num)\n+ elif subsamp_fn is not None:\n+ if subsamp_list is not None:\n+ raise Exception(\'Multiple subsampling options called in getVcfReader\')\n+ subsamp_file = open(subsamp_fn,\'r\')\n+ subsamp_list = [l.strip() for l in subsamp_file.readlines()]\n+ subsamp_file.close()\n+ if index is None:\n+ vcf_reader = pysam.VariantFile(vcfname)\n+ else:\n+ vcf_reader = pysam.VariantFile(vcfname, index_filename=index)\n+ if subsamp_list is not None:\n+ logging.debug(\'Subsampling %d individuals from VCF file\' %\n+ (len(subsamp_list)))\n+ vcf_reader.subset_samples(subsamp_list)\n+ return vcf_reader, reader_uncompressed\n' |
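Note: a hedged sketch of how the reader helpers in vcf_reader_func.py are typically used; the VCF path is illustrative and pysam must be installed. Setting compress_flag=True would bgzip and index an uncompressed input so that region-based fetches become possible.

import vcf_reader_func as vf

fmt = vf.checkFormat("example.vcf")      # 'vcf', 'bgzip', 'gzip', or 'other'
print(fmt)

# Returns a pysam.VariantFile plus a flag saying whether it is uncompressed.
reader, reader_uncompressed = vf.getVcfReader("example.vcf", compress_flag=False)
for rec in reader:
    alleles, total_sites, missing_inds = vf.getAlleleCountDict(rec)
    if not vf.isInvariant(rec):
        print(rec.chrom, rec.pos, dict(alleles))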
diff -r 000000000000 -r 1ad9c146ebb4 vcf_to_ima.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_to_ima.py Fri Oct 19 14:36:02 2018 -0400
| b'@@ -0,0 +1,426 @@\n+import sys\n+import pysam\n+import argparse\n+import os\n+import logging\n+from logging_module import initLogger, logArgs\n+from random import sample\n+from collections import OrderedDict\n+from gene_region import Region, RegionList\n+import vcf_reader_func as vf\n+from parse_functions import defaultsDictForFunction, getConfigFilename, makeRequiredList, getArgsWithConfig\n+\n+#from tabix_wrapper import prepVcf\n+\n+#Input: VCF file, reference sequence, region list (possibly .bed file)\n+#Output: Sequences with reference genome overlayed with VCF SNP calls\n+\n+sys.path.insert(0,os.path.abspath(os.path.join(os.pardir, \'andrew\')))\n+\n+from model import Model, read_model_file\n+\n+def createParser():\n+ parser = argparse.ArgumentParser(description=("Generates an IMa input "\n+ "file from a VCF file, a reference"\n+ " genome, a list of gene regions, "\n+ "and a population info file."),\n+ fromfile_prefix_chars="@")\n+ parser.add_argument("--vcf", dest="vcfname", help="Input VCF filename")\n+ parser.add_argument("--vcfs", dest="vcflist", nargs="+")\n+ parser.add_argument("--ref", dest="refname", help="Reference FASTA file")\n+ parser.add_argument("--bed", dest="genename", help="Name of gene region file")\n+ parser.add_argument("--pop", dest="popname", help=("Filename of pop "\n+ "model file"))\n+ parser.add_argument("--zero-ho", dest="zeroho", action="store_true")\n+ parser.add_argument("--zero-closed", dest="zeroclosed", action="store_true")\n+ parser.add_argument("--keep-indels", dest="indel_flag", action="store_true",\n+ help="Include indels when reporting sequences")\n+ parser.add_argument("--remove-multiallele",dest="remove_multiallele",action="store_true")\n+ parser.add_argument("--remove-missing", dest="remove_missing", default=-1, help=("Will filter out site if more than the given number of individuals (not genotypes) are missing data. 0 removes sites with any missing data, -1 (default) removes nothing"),type=int)\n+ parser.add_argument("--parsecpg",dest="parsecpg",action="store_true")\n+ parser.add_argument("--noseq",dest="printseq",action="store_false",help="Will only print variants when reference is provided. 
Used so CpG filtering can be done if invariant sites aren\'t desired in output")\n+ parser.add_argument("--trim-to-ref-length", dest="trim_seq",\n+ action="store_true",\n+ help=("Trims sequences if indels cause them to be "\n+ "longer than reference"))\n+ parser.add_argument("--output", dest="output_name", default="input.ima.u",\n+ help= ("Optional name for output other than default"))\n+ parser.add_argument("--gene-col", dest="gene_col", help= (\n+ "Comma-separated list of columns for gene region "\n+ " data, format is start/end if no chromosome "\n+ " data, start/end/chrom if so"))\n+ parser.add_argument("--compress-vcf", dest="compress_flag",\n+ action="store_true", help=("If input VCF is not "\n+ "compressed, will compress and use zip search"))\n+ parser.add_argument("--conf", dest="config_name", help= ("Name of "\n+ "file with configuration options"))\n+ parser.add_argument("--no-ref-check", dest="ref_check",\n+ action="store_false", help=("Prevents exception "\n+ "generated by mismatched reference alleles from "\n+ "VCF file compared to reference"))\n+ parser.add_argument("--poptag",dest="poptag",help=("If model file has "\n+ "multiple models, use model with this name"))\n+ parser.add_argument("--mutrate",dest="mutrate",type=float,default=1e-9,help="Mutation rate per base pair (def'..b'\n+ region_list = RegionList(filename=args.genename, zeroho=args.zeroho,\n+ zeroclosed=args.zeroclosed,\n+ colstr=args.gene_col)\n+ logging.info(\'Region list read\')\n+ fasta_ref = None\n+ if args.refname is not None:\n+ fasta_ref = pysam.FastaFile(args.refname)\n+ record_count = 1\n+ first_el = vcf_reader.prev_last_rec\n+\n+ logging.info(\'Total individuals: %d\' % (len(vcf_reader.prev_last_rec.samples)))\n+ if regions_provided:\n+ logging.info(\'Total regions: %d\' % (len(region_list.regions)))\n+ total_regions = (len(region_list.regions) if regions_provided else len(args.vcflist))\n+ if not args.fasta:\n+ writeHeader(popmodel, total_regions, output_file)\n+ if not single_file:\n+ vcf_reader.reader.close()\n+ for i in range(total_regions):\n+ #if regions_provided:\n+ #region = region_list.regions[i]\n+ if args.multi_out is not None:\n+ try:\n+ output_file.close()\n+ except:\n+ pass\n+ output_filename = next(outnamegen)\n+ output_file = open(output_filename,\'w\')\n+ if single_file:\n+ region = region_list.regions[i]\n+ rec_list = vcf_reader.getRecordList(region)\n+ else:\n+ vcf_reader = vf.VcfReader(args.vcflist[i],\n+ compress_flag=args.compress_flag,\n+ popmodel=popmodel,\n+ use_allpop=use_allpop)\n+ rec_list = vcf_reader.getRecordList()\n+ vcf_reader.reader.close()\n+ if regions_provided:\n+ region = region_list.regions[i]\n+ else:\n+ region = Region(rec_list[0].pos-1,rec_list[-1].pos,rec_list[0].chrom)\n+ if filter_recs:\n+ t = vf.filterSites(rec_list,remove_cpg=args.parsecpg,remove_indels=(not args.indel_flag),remove_multiallele=args.remove_multiallele,remove_missing=args.remove_missing,inform_level=1,fasta_ref=fasta_ref)\n+ rec_list = t\n+ if len(rec_list) == 0:\n+ logging.warning(("Region %s has no variants "\n+ "in VCF file") % (region.toStr()))\n+ logging.debug(\'Region %d to %d: %d variants\' %\n+ (region.start,region.end,len(rec_list)))\n+ ref_seq = None\n+ if fasta_ref is not None:\n+ try:\n+ ref_seq = fasta_ref.fetch(region.chrom, region.start, region.end)\n+ except KeyError:\n+ ref_seq = fasta_ref.fetch(vf.flipChrom(region.chrom),region.start,region.end)\n+ checkRefAlign(rec_list, fasta_ref, region.chrom, args.ref_check)\n+ if not args.fasta:\n+ reg_header = getLocusHeader(region, 
popmodel, rec_list,mut_rate=args.mutrate,inhet_sc=args.inhet_sc,include_seq=(fasta_ref is not None))\n+ output_file.write(reg_header+\'\\n\')\n+ popnum, indiv = 0, 0\n+ #for p in popmodel.pop_list:\n+ for p in vcf_reader.popkeys.keys():\n+ for indiv_idx in vcf_reader.popkeys[p]:\n+ for hap in range(len(first_el.samples[indiv_idx].alleles)):\n+ if args.fasta:\n+ output_file.write(\'>\'+first_el.samples[indiv_idx].name+\'_\'+str(hap)+\'\\n\')\n+ seq = generateSequence(rec_list, ref_seq,\n+ region, region.chrom, indiv_idx, hap, args)\n+ if not args.fasta:\n+ seq_name = str(popnum)+\':\'+str(indiv)+\':\'+str(hap)\n+ seq_name += \'\'.join([\' \' for i in range(len(seq_name),10)])\n+ output_file.write(seq_name)\n+\n+ output_file.write(seq+\'\\n\')\n+ indiv += 1\n+ popnum += 1\n+ indiv = 0\n+ record_count += 1\n+ output_file.close()\n+\n+if __name__ == "__main__":\n+ initLogger()\n+ vcf_to_ima(sys.argv[1:])\n' |
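Note: an illustrative call into the converter's entry point, mirroring the __main__ block above; every file name here is a placeholder that must exist for a real run, and the model name must match one in the model file.

from logging_module import initLogger
from vcf_to_ima import vcf_to_ima

initLogger()
vcf_to_ima(["--vcf", "example.vcf.gz",
            "--ref", "example_ref.fasta",
            "--bed", "gene_regions.bed",
            "--pop", "example.model",
            "--poptag", "2Pop",
            "--output", "example.ima.u"])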