annotate phe/variant/GATKVariantCaller.py @ 11:cd59be4a7fe3 draft default tip

Uploaded
author ulfschaefer
date Mon, 21 Dec 2015 11:12:19 -0500
parents c2f8e7580133
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
1 '''
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
2 Created on 22 Sep 2015
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
3
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
4 @author: alex
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
5 '''
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
6 from collections import OrderedDict
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
7 import logging
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
8 import os
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
9 import subprocess
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
10
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
11 from phe.variant import VariantCaller
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
12
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
13
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
14 class GATKVariantCaller(VariantCaller):
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
15 """Implemetation of the Broad institute's variant caller."""
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
16
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
17 name = "gatk"
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
18 """Plain text name of the variant caller."""
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
19
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
20 _default_options = "--sample_ploidy 2 --genotype_likelihoods_model BOTH -rf BadCigar -out_mode EMIT_ALL_SITES -nt 1"
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
21 """Default options for the variant caller."""
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
22
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
23 def __init__(self, cmd_options=None):
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
24 """Constructor"""
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
25 if cmd_options is None:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
26 cmd_options = self._default_options
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
27
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
28 super(GATKVariantCaller, self).__init__(cmd_options=cmd_options)
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
29
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
30 self.last_command = None
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
31
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
32 def get_info(self, plain=False):
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
33 d = {"name": "gatk", "version": self.get_version(), "command": self.last_command}
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
34
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
35 if plain:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
36 result = "GATK(%(version)s): %(command)s" % d
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
37 else:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
38 result = OrderedDict(d)
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
39
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
40 return result
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
41
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
42 def get_version(self):
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
43
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
44 p = subprocess.Popen(["java", "-jar", os.environ["GATK_JAR"], "-version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
45 (output, _) = p.communicate()
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
46
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
47 # last character is EOL.
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
48 version = output.split("\n")[-2]
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
49
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
50 return version
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
51
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
52 def make_vcf(self, *args, **kwargs):
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
53 ref = kwargs.get("ref")
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
54 bam = kwargs.get("bam")
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
55
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
56 make_aux = kwargs.get("make_aux", False)
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
57
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
58 if kwargs.get("vcf_file") is None:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
59 kwargs["vcf_file"] = "variants.vcf"
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
60
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
61 opts = {"ref": os.path.abspath(ref),
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
62 "bam": os.path.abspath(bam),
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
63 "gatk_jar": os.environ["GATK_JAR"],
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
64 "all_variants_file": os.path.abspath(kwargs.get("vcf_file")),
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
65 "extra_cmd_options": self.cmd_options}
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
66 if make_aux:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
67 if not self.create_aux_files(ref):
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
68 logging.warn("Auxiliary files were not created.")
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
69 return False
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
70
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
71 # Call variants
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
72 # FIXME: Sample ploidy = 2?
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
73 os.environ["GATK_JAR"]
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
74 cmd = "java -XX:+UseSerialGC -jar %(gatk_jar)s -T UnifiedGenotyper -R %(ref)s -I %(bam)s -o %(all_variants_file)s %(extra_cmd_options)s" % opts
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
75 success = os.system(cmd)
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
76
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
77 if success != 0:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
78 logging.warn("Calling variants returned non-zero exit status.")
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
79 return False
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
80
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
81 self.last_command = cmd
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
82
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
83 return True
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
84
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
85 def create_aux_files(self, ref):
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
86 """Create auxiliary files needed for this variant.
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
87
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
88 Tools needed: samtools and picard tools. Picard tools is a Java
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
89 library that can be defined using environment variable: PICARD_JAR
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
90 specifying path to picard.jar or PICARD_TOOLS_PATH specifying path
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
91 to the directory where separate jars are (older version before jars
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
92 were merged into a single picard.jar).
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
93 Parameters:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
94 -----------
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
95 ref: str
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
96 Path to the reference file.
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
97
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
98 Returns:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
99 --------
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
100 bool:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
101 True if auxiliary files were created, False otherwise.
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
102 """
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
103
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
104 ref_name, _ = os.path.splitext(ref)
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
105
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
106 success = os.system("samtools faidx %s" % ref)
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
107
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
108 if success != 0:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
109 logging.warn("Fasta index could not be created.")
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
110 return False
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
111
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
112 d = {"ref": ref, "ref_name": ref_name}
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
113
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
114 if os.environ.get("PICARD_TOOLS_PATH"):
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
115 d["picard_tools_path"] = os.path.join(os.environ["PICARD_TOOLS_PATH"], "CreateSequenceDictionary.jar")
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
116 elif os.environ.get("PICARD_JAR"):
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
117 # This is used in newer version of PICARD tool where multiple
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
118 # jars were merged into a single jar file.
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
119 d["picard_tools_path"] = "%s %s" % (os.environ["PICARD_JAR"], "CreateSequenceDictionary")
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
120 else:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
121 logging.error("Picard tools are not present in the path.")
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
122 return False
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
123
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
124 if not os.path.exists("%s.dict" % ref_name):
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
125 success = os.system("java -jar %(picard_tools_path)s R=%(ref)s O=%(ref_name)s.dict" % d)
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
126
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
127 if success != 0:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
128 logging.warn("Dictionary for the %s reference could not be created", ref)
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
129 return False
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
130 else:
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
131 logging.debug("PICARD AUX EXISTS.")
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
132
c2f8e7580133 Uploaded
ulfschaefer
parents:
diff changeset
133 return True