comparison defuse_results_to_vcf.py @ 11:b22f8634ff84 draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/defuse commit 23b94b5747c6956360cd2eca0a07a669929ea141-dirty
author jjohnson
date Sun, 17 Jan 2016 14:11:06 -0500
parents
children
comparison
equal deleted inserted replaced
10:f65857c1b92e 11:b22f8634ff84
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2012, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 # Jesse Erdmann
12 #
13 #------------------------------------------------------------------------------
14 """
15
16
17 """
18 This tool takes the defuse results.tsv tab-delimited file as input and creates a Variant Call Format file as output.
19 """
20
21 import sys,re,os.path
22 import optparse
23 from optparse import OptionParser
24
25 """
26 http://www.1000genomes.org/wiki/analysis/variant-call-format/vcf-variant-call-format-version-42
27
28 5. INFO keys used for structural variants
29 When the INFO keys reserved for encoding structural variants are used for imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles).
30 The following INFO keys are reserved for encoding structural variants. In general, when these keys are used by imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles).
31 ##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
32 ##INFO=<ID=NOVEL,Number=0,Type=Flag,Description="Indicates a novel structural variation">
33 ##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
34 For precise variants, END is POS + length of REF allele - 1, and the for imprecise variants the corresponding best estimate.
35 ##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
36 Value should be one of DEL, INS, DUP, INV, CNV, BND. This key can be derived from the REF/ALT fields but is useful for filtering.
37 ##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
38 One value for each ALT allele. Longer ALT alleles (e.g. insertions) have positive values, shorter ALT alleles (e.g. deletions) have negative values.
39 ##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
40 ##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
41 ##INFO=<ID=HOMLEN,Number=.,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints">
42 ##INFO=<ID=HOMSEQ,Number=.,Type=String,Description="Sequence of base pair identical micro-homology at event breakpoints">
43 ##INFO=<ID=BKPTID,Number=.,Type=String,Description="ID of the assembled alternate allele in the assembly file">
44 For precise variants, the consensus sequence the alternate allele assembly is derivable from the REF and ALT fields. However, the alternate allele assembly file may contain additional information about the characteristics of the alt allele contigs.
45 ##INFO=<ID=MEINFO,Number=4,Type=String,Description="Mobile element info of the form NAME,START,END,POLARITY">
46 ##INFO=<ID=METRANS,Number=4,Type=String,Description="Mobile element transduction info of the form CHR,START,END,POLARITY">
47 ##INFO=<ID=DGVID,Number=1,Type=String,Description="ID of this element in Database of Genomic Variation">
48 ##INFO=<ID=DBVARID,Number=1,Type=String,Description="ID of this element in DBVAR">
49 ##INFO=<ID=DBRIPID,Number=1,Type=String,Description="ID of this element in DBRIP">
50 ##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">
51 ##INFO=<ID=PARID,Number=1,Type=String,Description="ID of partner breakend">
52 ##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend">
53 ##INFO=<ID=CILEN,Number=2,Type=Integer,Description="Confidence interval around the length of the inserted material between breakends">
54 ##INFO=<ID=DP,Number=1,Type=Integer,Description="Read Depth of segment containing breakend">
55 ##INFO=<ID=DPADJ,Number=.,Type=Integer,Description="Read Depth of adjacency">
56 ##INFO=<ID=CN,Number=1,Type=Integer,Description="Copy number of segment containing breakend">
57 ##INFO=<ID=CNADJ,Number=.,Type=Integer,Description="Copy number of adjacency">
58 ##INFO=<ID=CICN,Number=2,Type=Integer,Description="Confidence interval around copy number for the segment">
59 ##INFO=<ID=CICNADJ,Number=.,Type=Integer,Description="Confidence interval around copy number for the adjacency">
60 6. FORMAT keys used for structural variants
61 ##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">
62 ##FORMAT=<ID=CNQ,Number=1,Type=Float,Description="Copy number genotype quality for imprecise events">
63 ##FORMAT=<ID=CNL,Number=.,Type=Float,Description="Copy number genotype likelihood for imprecise events">
64 ##FORMAT=<ID=NQ,Number=1,Type=Integer,Description="Phred style probability score that the variant is novel with respect to the genome's ancestor">
65 ##FORMAT=<ID=HAP,Number=1,Type=Integer,Description="Unique haplotype identifier">
66 ##FORMAT=<ID=AHAP,Number=1,Type=Integer,Description="Unique identifier of ancestral haplotype">
67 These keys are analogous to GT/GQ/GL and are provided for genotyping imprecise events by copy number (either because there is an unknown number of alternate alleles or because the haplotypes cannot be determined). CN specifies the integer copy number of the variant in this sample. CNQ is encoded as a phred quality -10log_10p(copy number genotype call is wrong). CNL specifies a list of log10 likelihoods for each potential copy number, starting from zero. When possible, GT/GQ/GL should be used instead of (or in addition to) these keys.
68
69 Specifying Complex Rearrangements with Breakends
70 An arbitrary rearrangement event can be summarized as a set of novel adjacencies.
71 Each adjacency ties together 2 breakends. The two breakends at either end of a novel adjacency are called mates.
72 There is one line of VCF (i.e. one record) for each of the two breakends in a novel adjacency. A breakend record is identified with the tag SYTYPE=BND" in the INFO field. The REF field of a breakend record indicates a base or sequence s of bases beginning at position POS, as in all VCF records. The ALT field of a breakend record indicates a replacement for s. This "breakend replacement" has three parts:
73 the string t that replaces places s. The string t may be an extended version of s if some novel bases are inserted during the formation of the novel adjacency.
74 The position p of the mate breakend, indicated by a string of the form "chr:pos". This is the location of the first mapped base in the piece being joined at this novel adjacency.
75 The direction that the joined sequence continues in, starting from p. This is indicated by the orientation of square brackets surrounding p.
76 These 3 elements are combined in 4 possible ways to create the ALT. In each of the 4 cases, the assertion is that s is replaced with t, and then some piece starting at position p is joined to t. The cases are:
77 REF ALT Meaning
78 s t[p[ piece extending to the right of p is joined after t
79 s t]p] reverse comp piece extending left of p is joined after t
80 s ]p]t piece extending to the left of p is joined before t
81 s [p[t reverse comp piece extending right of p is joined before t
82
83 Examples:
84 #CHROM POS ID REF ALT QUAL FILT INFO
85 2 321681 bnd_W G G]17:198982] 6 PASS SVTYPE=BND;MATEID=bnd_Y
86 2 321682 bnd_V T ]13:123456]T 6 PASS SVTYPE=BND;MATEID=bnd_U
87 13 123456 bnd_U C C[2:321682[ 6 PASS SVTYPE=BND;MATEID=bnd_V
88 13 123457 bnd_X A [17:198983[A 6 PASS SVTYPE=BND;MATEID=bnd_Z
89 17 198982 bnd_Y A A]2:321681] 6 PASS SVTYPE=BND;MATEID=bnd_W
90 17 198983 bnd_Z C [13:123457[C 6 PASS SVTYPE=BND;MATEID=bnd_X
91 """
92
93 vcf_header = """\
94 ##fileformat=VCFv4.1
95 ##source=defuse
96 ##reference=%s
97 ##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
98 ##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
99 ##INFO=<ID=MATEID,Number=1,Type=String,Description="ID of the BND mate">
100 ##INFO=<ID=MATELOC,Number=1,Type=String,Description="The chrom:position of the BND mate">
101 ##INFO=<ID=GENESTRAND,Number=2,Type=String,Description="Strands">
102 ##INFO=<ID=DP,Number=1,Type=Integer,Description="Read Depth of segment containing breakend">
103 ##INFO=<ID=SPLITCNT,Number=1,Type=Integer,Description="number of split reads supporting the prediction">
104 ##INFO=<ID=SPANCNT,Number=1,Type=Integer,Description="number of spanning reads supporting the fusion">
105 ##INFO=<ID=HOMLEN,Number=1,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints">
106 ##INFO=<ID=SPLICESCORE,Number=1,Type=Integer,Description="number of nucleotides similar to GTAG at fusion splice">
107 ##INFO=<ID=GENE,Number=2,Type=String,Description="Gene Names at each breakend">
108 ##INFO=<ID=GENEID,Number=2,Type=String,Description="Gene IDs at each breakend">
109 ##INFO=<ID=GENELOC,Number=2,Type=String,Description="location of breakpoint releative to genes">
110 ##INFO=<ID=EXPR,Number=2,Type=Integer,Description="expression of genes as number of concordant pairs aligned to exons">
111 ##INFO=<ID=ORF,Number=0,Type=Flag,Description="fusion combines genes in a way that preserves a reading frame">
112 ##INFO=<ID=EXONBND,Number=0,Type=Flag,Description="fusion splice at exon boundaries">
113 ##INFO=<ID=INTERCHROM,Number=0,Type=Flag,Description="fusion produced by an interchromosomal translocation">
114 ##INFO=<ID=READTHROUGH,Number=0,Type=Flag,Description="fusion involving adjacent potentially resulting from co-transcription rather than genome rearrangement">
115 ##INFO=<ID=ADJACENT,Number=0,Type=Flag,Description="fusion between adjacent genes">
116 ##INFO=<ID=ALTSPLICE,Number=0,Type=Flag,Description="fusion likely the product of alternative splicing between adjacent genes">
117 ##INFO=<ID=DELETION,Number=0,Type=Flag,Description="fusion produced by a genomic deletion">
118 ##INFO=<ID=EVERSION,Number=0,Type=Flag,Description="fusion produced by a genomic eversion">
119 ##INFO=<ID=INVERSION,Number=0,Type=Flag,Description="fusion produced by a genomic inversion">
120 #CHROM POS ID REF ALT QUAL FILTER INFO\
121 """
122
123 def cmp_alphanumeric(s1,s2):
124 if s1 == s2:
125 return 0
126 a1 = re.findall("\d+|[a-zA-Z]+",s1)
127 a2 = re.findall("\d+|[a-zA-Z]+",s2)
128 for i in range(min(len(a1),len(a2))):
129 if a1[i] == a2[i]:
130 continue
131 if a1[i].isdigit() and a2[i].isdigit():
132 return int(a1[i]) - int(a2[i])
133 return 1 if a1[i] > a2[i] else -1
134 return len(a1) - len(a2)
135
136 def __main__():
137 # VCF functions
138 chr_dict = dict()
139 def add_vcf_line(chr,pos,id,line):
140 if chr not in chr_dict:
141 pos_dict = dict()
142 chr_dict[chr] = pos_dict
143 if pos not in chr_dict[chr]:
144 id_dict = dict()
145 chr_dict[chr][pos] = id_dict
146 chr_dict[chr][pos][id] = line
147
148 def write_vcf():
149 print >> outputFile, vcf_header % (refname)
150 for chr in sorted(chr_dict.keys(),cmp=cmp_alphanumeric):
151 for pos in sorted(chr_dict[chr].keys()):
152 for id in chr_dict[chr][pos]:
153 print >> outputFile, chr_dict[chr][pos][id]
154 #Parse Command Line
155 parser = optparse.OptionParser()
156 # files
157 parser.add_option( '-i', '--input', dest='input', help='The input defuse results.tsv file (else read from stdin)' )
158 parser.add_option( '-o', '--output', dest='output', help='The output vcf file (else write to stdout)' )
159 parser.add_option( '-r', '--reference', dest='reference', default=None, help='The genomic reference id' )
160 (options, args) = parser.parse_args()
161
162 # results.tsv input
163 if options.input != None:
164 try:
165 inputPath = os.path.abspath(options.input)
166 inputFile = open(inputPath, 'r')
167 except Exception, e:
168 print >> sys.stderr, "failed: %s" % e
169 exit(2)
170 else:
171 inputFile = sys.stdin
172 # vcf output
173 if options.output != None:
174 try:
175 outputPath = os.path.abspath(options.output)
176 outputFile = open(outputPath, 'w')
177 except Exception, e:
178 print >> sys.stderr, "failed: %s" % e
179 exit(3)
180 else:
181 outputFile = sys.stdout
182
183 refname = options.reference if options.reference else 'unknown'
184
185 svtype = 'SVTYPE=BND'
186 filt = 'PASS'
187 columns = []
188 try:
189 for linenum,line in enumerate(inputFile):
190 ## print >> sys.stderr, "%d: %s\n" % (linenum,line)
191 fields = line.strip().split('\t')
192 if line.startswith('cluster_id'):
193 columns = fields
194 ## print >> sys.stderr, "columns: %s\n" % columns
195 continue
196 cluster_id = fields[columns.index('cluster_id')]
197 gene_chromosome1 = fields[columns.index('gene_chromosome1')]
198 gene_chromosome2 = fields[columns.index('gene_chromosome2')]
199 genomic_strand1 = fields[columns.index('genomic_strand1')]
200 genomic_strand2 = fields[columns.index('genomic_strand2')]
201 gene1 = fields[columns.index('gene1')]
202 gene2 = fields[columns.index('gene2')]
203 gene_info = 'GENEID=%s,%s' % (gene1,gene2)
204 gene_name1 = fields[columns.index('gene_name1')]
205 gene_name2 = fields[columns.index('gene_name2')]
206 gene_name_info = 'GENE=%s,%s' % (gene_name1,gene_name2)
207 gene_location1 = fields[columns.index('gene_location1')]
208 gene_location2 = fields[columns.index('gene_location2')]
209 gene_loc = 'GENELOC=%s,%s' % (gene_location1,gene_location2)
210 expression1 = int(fields[columns.index('expression1')])
211 expression2 = int(fields[columns.index('expression2')])
212 expr = 'EXPR=%d,%d' % (expression1,expression2)
213 genomic_break_pos1 = int(fields[columns.index('genomic_break_pos1')])
214 genomic_break_pos2 = int(fields[columns.index('genomic_break_pos2')])
215 breakpoint_homology = int(fields[columns.index('breakpoint_homology')])
216 homlen = 'HOMLEN=%s' % breakpoint_homology
217 orf = fields[columns.index('orf')] == 'Y'
218 exonboundaries = fields[columns.index('exonboundaries')] == 'Y'
219 read_through = fields[columns.index('read_through')] == 'Y'
220 interchromosomal = fields[columns.index('interchromosomal')] == 'Y'
221 adjacent = fields[columns.index('adjacent')] == 'Y'
222 altsplice = fields[columns.index('altsplice')] == 'Y'
223 deletion = fields[columns.index('deletion')] == 'Y'
224 eversion = fields[columns.index('eversion')] == 'Y'
225 inversion = fields[columns.index('inversion')] == 'Y'
226 span_count = int(fields[columns.index('span_count')])
227 splitr_count = int(fields[columns.index('splitr_count')])
228 splice_score = int(fields[columns.index('splice_score')])
229 probability = fields[columns.index('probability')] if columns.index('probability') else '.'
230 splitr_sequence = fields[columns.index('splitr_sequence')]
231 split_seqs = splitr_sequence.split('|')
232 mate_id1 = "bnd_%s_1" % cluster_id
233 mate_id2 = "bnd_%s_2" % cluster_id
234 ref1 = split_seqs[0][-1]
235 ref2 = split_seqs[1][0]
236 b1 = '[' if genomic_strand1 == '+' else ']'
237 b2 = '[' if genomic_strand2 == '+' else ']'
238 alt1 = "%s%s%s:%d%s" % (ref1,b2,gene_chromosome2,genomic_break_pos2,b2)
239 alt2 = "%s%s:%d%s%s" % (b1,gene_chromosome1,genomic_break_pos1,b1,ref2)
240 #TODO evaluate what should be included in the INFO field
241 info = ['DP=%d' % (span_count + splitr_count),'SPLITCNT=%d' % splitr_count,'SPANCNT=%d' % span_count,gene_name_info,gene_info,gene_loc,expr,homlen,'SPLICESCORE=%d' % splice_score]
242 if orf:
243 info.append('ORF')
244 if exonboundaries:
245 info.append('EXONBND')
246 if interchromosomal:
247 info.append('INTERCHROM')
248 if read_through:
249 info.append('READTHROUGH')
250 if adjacent:
251 info.append('ADJACENT')
252 if altsplice:
253 info.append('ALTSPLICE')
254 if deletion:
255 info.append('DELETION')
256 if eversion:
257 info.append('EVERSION')
258 if inversion:
259 info.append('INVERSION')
260 info1 = [svtype,'MATEID=%s;MATELOC=%s:%d' % (mate_id2,gene_chromosome2,genomic_break_pos2)] + info
261 info2 = [svtype,'MATEID=%s;MATELOC=%s:%d' % (mate_id1,gene_chromosome1,genomic_break_pos1)] + info
262 qual = int(float(fields[columns.index('probability')]) * 255) if columns.index('probability') else '.'
263 vcf1 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome1,genomic_break_pos1, mate_id1, ref1, alt1, qual, filt, ';'.join(info1) )
264 vcf2 = '%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s'% (gene_chromosome2,genomic_break_pos2, mate_id2, ref2, alt2, qual, filt, ';'.join(info2) )
265 add_vcf_line(gene_chromosome1,genomic_break_pos1,mate_id1,vcf1)
266 add_vcf_line(gene_chromosome2,genomic_break_pos2,mate_id2,vcf2)
267 write_vcf()
268 except Exception, e:
269 print >> sys.stderr, "failed: %s" % e
270 sys.exit(1)
271
272 if __name__ == "__main__" : __main__()
273