Mercurial > repos > ulfschaefer > vcfs2fasta
diff phe/variant/GATKVariantCaller.py @ 14:f72039c5faa4 draft
Uploaded
| author | ulfschaefer | 
|---|---|
| date | Wed, 16 Dec 2015 07:29:05 -0500 | 
| parents | |
| children | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phe/variant/GATKVariantCaller.py Wed Dec 16 07:29:05 2015 -0500 @@ -0,0 +1,126 @@ +''' +Created on 22 Sep 2015 + +@author: alex +''' +from collections import OrderedDict +import logging +import os +import subprocess + +from phe.variant import VariantCaller + + +class GATKVariantCaller(VariantCaller): + """Implemetation of the Broad institute's variant caller.""" + + name = "gatk" + """Plain text name of the variant caller.""" + + _default_options = "--sample_ploidy 2 --genotype_likelihoods_model BOTH -rf BadCigar -out_mode EMIT_ALL_SITES -nt 1" + """Default options for the variant caller.""" + + def __init__(self, cmd_options=None): + """Constructor""" + if cmd_options is None: + cmd_options = self._default_options + + super(GATKVariantCaller, self).__init__(cmd_options=cmd_options) + + self.last_command = None + + def get_info(self, plain=False): + d = {"name": "gatk", "version": self.get_version(), "command": self.last_command} + + if plain: + result = "GATK(%(version)s): %(command)s" % d + else: + result = OrderedDict(d) + + return result + + def get_version(self): + + p = subprocess.Popen(["java", "-jar", os.environ["GATK_JAR"], "-version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + (output, _) = p.communicate() + + # last character is EOL. + version = output.split("\n")[-2] + + return version + + def make_vcf(self, *args, **kwargs): + ref = kwargs.get("ref") + bam = kwargs.get("bam") + + if kwargs.get("vcf_file") is None: + kwargs["vcf_file"] = "variants.vcf" + + opts = {"ref": os.path.abspath(ref), + "bam": os.path.abspath(bam), + "gatk_jar": os.environ["GATK_JAR"], + "all_variants_file": os.path.abspath(kwargs.get("vcf_file")), + "extra_cmd_options": self.cmd_options} + +# if not self.create_aux_files(ref): +# logging.warn("Auxiliary files were not created.") +# return False + + # Call variants + # FIXME: Sample ploidy = 2? + os.environ["GATK_JAR"] + cmd = "java -XX:+UseSerialGC -jar %(gatk_jar)s -T UnifiedGenotyper -R %(ref)s -I %(bam)s -o %(all_variants_file)s %(extra_cmd_options)s" % opts + success = os.system(cmd) + + if success != 0: + logging.warn("Calling variants returned non-zero exit status.") + return False + + self.last_command = cmd + + return True + + def create_aux_files(self, ref): + """Create auxiliary files needed for this variant. + + Tools needed: samtools and picard tools. Picard tools is a Java + library that can be defined using environment variable: PICARD_JAR + specifying path to picard.jar or PICARD_TOOLS_PATH specifying path + to the directory where separate jars are (older version before jars + were merged into a single picard.jar). + Parameters: + ----------- + ref: str + Path to the reference file. + + Returns: + -------- + bool: + True if auxiliary files were created, False otherwise. + """ + + ref_name, _ = os.path.splitext(ref) + + success = os.system("samtools faidx %s" % ref) + + if success != 0: + logging.warn("Fasta index could not be created.") + return False + + d = {"ref": ref, "ref_name": ref_name} + + if os.environ.get("PICARD_TOOLS_PATH"): + d["picard_tools_path"] = os.path.join(os.environ["PICARD_TOOLS_PATH"], "CreateSequenceDictionary.jar") + elif os.environ.get("PICARD_JAR"): + # This is used in newer version of PICARD tool where multiple + # jars were merged into a single jar file. + d["picard_tools_path"] = "%s %s" % (os.environ["PICARD_JAR"], "CreateSequenceDictionary") + else: + logging.error("Picard tools are not present in the path.") + return False + + success = os.system("java -jar %(picard_tools_path)s R=%(ref)s O=%(ref_name)s.dict" % d) + + if success != 0: + logging.warn("Dictionary for the %s reference could not be created", ref) + return False
