comparison phe/variant/GATKVariantCaller.py @ 14:f72039c5faa4 draft

Uploaded
author ulfschaefer
date Wed, 16 Dec 2015 07:29:05 -0500
parents
children
comparison
equal deleted inserted replaced
13:2e69ce9dca65 14:f72039c5faa4
1 '''
2 Created on 22 Sep 2015
3
4 @author: alex
5 '''
6 from collections import OrderedDict
7 import logging
8 import os
9 import subprocess
10
11 from phe.variant import VariantCaller
12
13
14 class GATKVariantCaller(VariantCaller):
15 """Implemetation of the Broad institute's variant caller."""
16
17 name = "gatk"
18 """Plain text name of the variant caller."""
19
20 _default_options = "--sample_ploidy 2 --genotype_likelihoods_model BOTH -rf BadCigar -out_mode EMIT_ALL_SITES -nt 1"
21 """Default options for the variant caller."""
22
23 def __init__(self, cmd_options=None):
24 """Constructor"""
25 if cmd_options is None:
26 cmd_options = self._default_options
27
28 super(GATKVariantCaller, self).__init__(cmd_options=cmd_options)
29
30 self.last_command = None
31
32 def get_info(self, plain=False):
33 d = {"name": "gatk", "version": self.get_version(), "command": self.last_command}
34
35 if plain:
36 result = "GATK(%(version)s): %(command)s" % d
37 else:
38 result = OrderedDict(d)
39
40 return result
41
42 def get_version(self):
43
44 p = subprocess.Popen(["java", "-jar", os.environ["GATK_JAR"], "-version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
45 (output, _) = p.communicate()
46
47 # last character is EOL.
48 version = output.split("\n")[-2]
49
50 return version
51
52 def make_vcf(self, *args, **kwargs):
53 ref = kwargs.get("ref")
54 bam = kwargs.get("bam")
55
56 if kwargs.get("vcf_file") is None:
57 kwargs["vcf_file"] = "variants.vcf"
58
59 opts = {"ref": os.path.abspath(ref),
60 "bam": os.path.abspath(bam),
61 "gatk_jar": os.environ["GATK_JAR"],
62 "all_variants_file": os.path.abspath(kwargs.get("vcf_file")),
63 "extra_cmd_options": self.cmd_options}
64
65 # if not self.create_aux_files(ref):
66 # logging.warn("Auxiliary files were not created.")
67 # return False
68
69 # Call variants
70 # FIXME: Sample ploidy = 2?
71 os.environ["GATK_JAR"]
72 cmd = "java -XX:+UseSerialGC -jar %(gatk_jar)s -T UnifiedGenotyper -R %(ref)s -I %(bam)s -o %(all_variants_file)s %(extra_cmd_options)s" % opts
73 success = os.system(cmd)
74
75 if success != 0:
76 logging.warn("Calling variants returned non-zero exit status.")
77 return False
78
79 self.last_command = cmd
80
81 return True
82
83 def create_aux_files(self, ref):
84 """Create auxiliary files needed for this variant.
85
86 Tools needed: samtools and picard tools. Picard tools is a Java
87 library that can be defined using environment variable: PICARD_JAR
88 specifying path to picard.jar or PICARD_TOOLS_PATH specifying path
89 to the directory where separate jars are (older version before jars
90 were merged into a single picard.jar).
91 Parameters:
92 -----------
93 ref: str
94 Path to the reference file.
95
96 Returns:
97 --------
98 bool:
99 True if auxiliary files were created, False otherwise.
100 """
101
102 ref_name, _ = os.path.splitext(ref)
103
104 success = os.system("samtools faidx %s" % ref)
105
106 if success != 0:
107 logging.warn("Fasta index could not be created.")
108 return False
109
110 d = {"ref": ref, "ref_name": ref_name}
111
112 if os.environ.get("PICARD_TOOLS_PATH"):
113 d["picard_tools_path"] = os.path.join(os.environ["PICARD_TOOLS_PATH"], "CreateSequenceDictionary.jar")
114 elif os.environ.get("PICARD_JAR"):
115 # This is used in newer version of PICARD tool where multiple
116 # jars were merged into a single jar file.
117 d["picard_tools_path"] = "%s %s" % (os.environ["PICARD_JAR"], "CreateSequenceDictionary")
118 else:
119 logging.error("Picard tools are not present in the path.")
120 return False
121
122 success = os.system("java -jar %(picard_tools_path)s R=%(ref)s O=%(ref_name)s.dict" % d)
123
124 if success != 0:
125 logging.warn("Dictionary for the %s reference could not be created", ref)
126 return False