annotate snpEff_cds_report.py @ 0:cdbdac66d6b5

Uploaded
author jjohnson
date Tue, 12 Mar 2013 13:59:14 -0400
parents
children 29b286896c50
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/python
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
2 import sys,os,tempfile,re
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
3 import httplib, urllib
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
4 import optparse
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
5 #import vcfClass
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
6 #from vcfClass import *
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
7 #import tools
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
8 #from tools import *
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
9
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
10 debug = False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
11 cds_anchor = 'cds_variation'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
12 aa_anchor = 'aa_variation'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
13
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
14 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
15 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
16 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
17 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
18 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
19 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
20 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
21 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
22 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
23 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
24 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
25 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
26 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
27 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
28 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
29 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
30
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
31 def reverseComplement(seq) :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
32 rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
33 return rev
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
34
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
35 def translate(seq) :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
36 rna = seq.upper().replace('T','U')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
37 aa = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
38 for i in range(0,len(rna) - 2, 3):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
39 aa.append(codon_map[rna[i:i+3]])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
40 return ''.join(aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
41
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
42 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
43 SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
44 Queries that request a sequence always return the sequence in the first column regardless of the order of attributes.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
45 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
46 query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
47 <!DOCTYPE Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
48 <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
49 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
50 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
51 <Filter name = "with_ccds" excluded = "0"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
52 <Attribute name = "ensembl_transcript_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
53 <Attribute name = "ccds" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
54 </Dataset>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
55 </Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
56 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
57 ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
58 query_template = """<?xml version="1.0" encoding="UTF-8"?>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
59 <!DOCTYPE Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
60 <Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
61 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
62 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
63 <Filter name = "biotype" value = "protein_coding"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
64 __CCDS_FILTER_HERE__
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
65 <Attribute name = "cdna" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
66 <Attribute name = "ensembl_gene_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
67 <Attribute name = "ensembl_transcript_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
68 <Attribute name = "strand" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
69 <Attribute name = "transcript_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
70 <Attribute name = "transcript_end"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
71 <Attribute name = "exon_chrom_start"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
72 <Attribute name = "exon_chrom_end"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
73 <Attribute name = "cdna_coding_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
74 <Attribute name = "cdna_coding_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
75 <Attribute name = "cds_length"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
76 <Attribute name = "rank"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
77 <Attribute name = "5_utr_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
78 <Attribute name = "5_utr_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
79 <Attribute name = "3_utr_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
80 <Attribute name = "3_utr_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
81 <Attribute name = "ensembl_peptide_id"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
82 <Attribute name = "start_position" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
83 <Attribute name = "end_position" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
84 </Dataset>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
85 </Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
86 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
87 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
88 Columns(19):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
89 ['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)']
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
90
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
91 0 cDNA sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
92 1 Ensembl Gene ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
93 2 Ensembl Transcript ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
94 3 Strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
95 - 4 Transcript Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
96 - 5 Transcript End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
97 6 Exon Chr Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
98 7 Exon Chr End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
99 8 CDS Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
100 9 CDS End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
101 10 CDS Length
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
102 - 11 Exon Rank in Transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
103 12 5' UTR Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
104 13 5' UTR End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
105 14 3' UTR Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
106 15 3' UTR End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
107 - 16 Ensembl Protein ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
108 - 17 Gene Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
109 - 18 Gene End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
110 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
111
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
112 # return {transcript_id : ccds_id}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
113 def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
114 ccds_dict = dict()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
115 if transcript_ids:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
116 query = query_ccds_template
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
117 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
118 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
119 params = urllib.urlencode({'query':query})
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
120 headers = {"Accept": "text/plain"}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
121 if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
122 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
123 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
124 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
125 conn.request("POST", "/biomart/martservice", params, headers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
126 response = conn.getresponse()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
127 data = response.read()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
128 if len(data) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
129 # if debug: print >> sys.stdout, "%s\n\n" % data
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
130 lines = data.split('\n')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
131 for line in lines:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
132 fields = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
133 if len(fields) == 2:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
134 (transcript_id,ccds) = fields
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
135 ccds_dict[transcript_id] = ccds
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
136 if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
137 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
138 sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
139 return ccds_dict
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
140
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
141 def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
142 query = query_template
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
143 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
144 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
145 query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
146 return query
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
147
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
148 # return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
149 def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
150 transcript_id = snpEffect.transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
151 chr_pos = snpEffect.pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
152 query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
153 if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
154 params = urllib.urlencode({'query':query})
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
155 headers = {"Accept": "text/plain"}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
156 pos = snpEffect.pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
157 cds_pos = None # zero based offset
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
158 coding_sequence = ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
159 cds_ref = snpEffect.ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
160 cds_alt = snpEffect.alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
161 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
162 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
163 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
164 conn.request("POST", "/biomart/martservice", params, headers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
165 response = conn.getresponse()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
166 data = response.read()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
167 if len(data) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
168 # if debug: print >> sys.stdout, "%s\n\n" % data
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
169 lines = data.split('\n')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
170 # Use the header line to determine the order of fields
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
171 line = lines[0]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
172 # if debug: print >> sys.stdout, "Headers:\n%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
173 colnames = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
174 # for i in range(len(colnames)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
175 #
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
176 if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
177 for line in lines[1:]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
178 if line == None or len(line) < 2:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
179 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
180 field = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
181 if len(field) != len(colnames):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
182 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
183 if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
184 if field[10] == None or len(field[10]) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
185 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
186 if debug:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
187 for i in range(len(colnames)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
188 print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
189 snpEffect.gene_id = field[1]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
190 strand = '+' if int(field[3]) > 0 else '-'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
191 snpEffect.strand = strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
192 # coding_sequence is first
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
193 if len(field) > 10 and re.match('[ATGCN]+',field[0]):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
194 if field[10] == None or len(field[10]) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
195 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
196 cdna_seq = field[0]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
197 in_utr = False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
198 # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
199 utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
200 utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
201 utr_5_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
202 for i in range(len(utr_5_starts)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
203 utr_5_start = int(utr_5_starts[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
204 utr_5_end = int(utr_5_ends[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
205 utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
206 if utr_5_start <= pos <= utr_5_end :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
207 in_utr = True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
208 if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end);
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
209 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
210 utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
211 utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
212 for i in range(len(utr_3_starts)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
213 utr_3_start = int(utr_3_starts[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
214 utr_3_end = int(utr_3_ends[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
215 if utr_3_start <= pos <= utr_3_end :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
216 in_utr = True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
217 if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end);
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
218 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
219 # Get coding sequence from cdna
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
220 cdna_length = int(field[10])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
221 cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
222 cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
223 cdna_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
224 for i in range(len(cdna_coding_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
225 cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
226 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
227 cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
228 for i in range(len(cdna_coding_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
229 if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
230 coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
231 snpEffect.cds = coding_sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
232 if coding_sequence and len(coding_sequence) >= 3:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
233 snpEffect.cds_stop_codon = coding_sequence[-3:]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
234 if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
235 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
236 if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
237 if not in_utr:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
238 exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
239 exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
240 exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
241 exon_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
242 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
243 exon_lens.append(exon_end[i] - exon_start[i] + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
244 if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
245 if debug:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
246 bp_tot = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
247 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
248 exon_len = exon_end[i] - exon_start[i] + 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
249 bp_tot += exon_len
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
250 print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
251 cds_idx = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
252 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
253 # Does this exon have cds bases - i.e. not entirely in UTR
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
254 if len(utr_5_ends) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
255 if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
256 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
257 if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
258 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
259 exon_len = exon_end[i] - exon_start[i] + 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
260 if exon_start[i] <= pos <= exon_end[i]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
261 if strand == '+':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
262 if cds_idx: # find offset from start of cdna_coding and exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
263 offset = pos - exon_start[i]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
264 else: # find offset from end of cdna_coding and exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
265 offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
266 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
267 else: # negative strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
268 cds_ref = reverseComplement(snpEffect.ref)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
269 cds_alt = reverseComplement(snpEffect.alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
270 offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
271 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
272 snpEffect.cds_pos = cds_pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
273 snpEffect.cds_ref = cds_ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
274 snpEffect.cds_alt = cds_alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
275 return snpEffect
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
276 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
277 cds_idx += 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
278 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
279 sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
280 finally:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
281 if conn != None :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
282 conn.close()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
283 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
284
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
285 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
286 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
287 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
288 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
289 class SnpEffect( object ):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
290 report_headings = ['Gene','Variant Position','Reference','Variation','Penetrance Percent','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','AA Variation']
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
291 def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
292 self.chrom = chrom
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
293 self.pos = int(pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
294 self.ref = ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
295 self.alt = alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
296 self.freq = float(freq) if freq else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
297 self.depth = int(depth) if depth else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
298 # From SnpEffect VCF String
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
299 self.effect = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
300 self.effect_impact = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
301 self.functional_class = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
302 self.codon_change = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
303 self.amino_acid_change = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
304 self.amino_acid_length = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
305 self.gene_name = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
306 self.gene_id = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
307 self.gene_biotype = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
308 self.coding = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
309 self.transcript = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
310 self.transcript_ids = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
311 self.exon = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
312 self.cds_stop_codon = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
313 # retrieve from ensembl
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
314 self.strand = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
315 self.cds_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
316 self.cds_ref = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
317 self.cds_alt = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
318 self.cds = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
319 self.aa_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
320 self.aa_len = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
321 self.alt_stop_codon = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
322 def chrPos(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
323 return "%s:%s" % (self.chrom,self.pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
324 def setEffect(self, effect, snpEffVersion = None):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
325 if snpEffVersion and snpEffVersion == '2':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
326 (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
327 else: # SnpEffVersion 3 # adds Amino_Acid_length field
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
328 (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
329 self.amino_acid_length = amino_acid_length
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
330 self.effect_impact = effect_impact
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
331 self.functional_class = functional_class
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
332 self.codon_change = codon_change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
333 self.amino_acid_change = amino_acid_change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
334 self.gene_name = gene_name
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
335 self.gene_biotype = gene_biotype
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
336 self.coding = coding
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
337 self.transcript = transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
338 self.exon = exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
339 def parseEffect(self, effect, snpEffVersion = None):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
340 (eff,effs) = effect.rstrip(')').split('(')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
341 self.effect = eff
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
342 eff_fields = effs.split('|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
343 if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
344 self.setEffect(eff_fields, snpEffVersion)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
345 def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
346 getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
347 return len(self.cds) > 0 if self.cds else False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
348 def score(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
349 return self.freq * self.depth if self.freq and self.depth else -1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
350 def getCodingSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
351 return self.cds
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
352 def getVariantSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
353 if self.cds and self.cds_pos and self.cds_ref and self.cds_alt:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
354 return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
355 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
356 if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
357 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
358 def getCodingAminoSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
359 seq = translate(self.cds) if self.cds else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
360 if seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
361 self.aa_len = len(seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
362 return seq
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
363 def getVariantAminoSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
364 variant_seq = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
365 return translate(variant_seq) if variant_seq else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
366 def getVariantPeptide(self,toStopCodon=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
367 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
368 if var_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
369 if toStopCodon:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
370 end_pos = var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
371 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
372 end_pos = len(var_aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
373 novel_peptide = var_aa[var_aa_pos:end_pos]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
374 return novel_peptide
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
375 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
376 # [preAA,varAA,postAA, varPeptide, varOffset, subAA]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
377 def getNonSynonymousPeptide(self,start_offset,end_offset,toStopCodon=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
378 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
379 if var_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
380 start_pos = max(var_aa_pos - start_offset,0) if start_offset and int(start_offset) >= 0 else 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
381 if toStopCodon:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
382 end_pos = var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
383 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
384 end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
385 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
386 varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
387 if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
388 mutation = "p.%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
389 preAA = var_aa[start_pos:var_aa_pos] # N-term side
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
390 postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
391 novel_peptide = var_aa[start_pos:end_pos]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
392 return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
393 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
394 sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
395 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
396 # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
397 # positions are 0-based array indexes
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
398 def getSequences(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
399 coding_dna = self.getCodingSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
400 coding_aa = self.getCodingAminoSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
401 var_dna = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
402 var_aa = self.getVariantAminoSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
403 var_aa_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
404 var_aa_end = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
405 if var_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
406 var_aa_pos = self.cds_pos / 3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
407 for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
408 if var_aa[j] != coding_aa[j]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
409 var_aa_pos = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
410 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
411 self.aa_pos = var_aa_pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
412 var_stop = var_aa.find('*',var_aa_pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
413 if var_stop < 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
414 var_aa_end = len(var_aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
415 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
416 var_aa_end = var_stop + 1 # include the stop codon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
417 stop_pos = var_stop * 3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
418 self.alt_stop_codon = var_dna[stop_pos:stop_pos+3]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
419 return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
420 def inHomopolymer(self,polyA_limit):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
421 if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
422 ## check if base changes at cds_pos involve A or T
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
423 bdiff = None # the difference between the cds_ref and cds_alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
424 boff = 0 # the offset to the difference
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
425 if len(self.cds_ref) < len(self.cds_alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
426 bdiff = re.sub(self.cds_ref,'',self.cds_alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
427 boff = self.cds_alt.find(bdiff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
428 elif len(self.cds_alt) < len(self.cds_ref):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
429 bdiff = re.sub(self.cds_alt,'',self.cds_ref)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
430 boff = self.cds_ref.find(bdiff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
431 if bdiff:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
432 ## check the number of similar base neighboring the change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
433 alt_seq = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
434 subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
435 ## adjust match length if the cps_pos is within polyA_limit form sequence end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
436 match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
437 if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
438 ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
439 if bdiff.find('A') >= 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
440 pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
441 match = re.match(pat,subseq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
442 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
443 if match:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
444 return True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
445 if bdiff.find('T') >= 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
446 pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
447 match = re.match(pat,subseq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
448 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
449 if match:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
450 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups())
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
451 return True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
452 return False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
453 def getReportHeader():
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
454 return report_headings
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
455 def getReportFields(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
456 gene_name = self.gene_name if self.gene_name else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
457 cds_ref = self.cds_ref if self.cds_ref else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
458 cds_alt = self.cds_alt if self.cds_alt else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
459 hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
460 if debug: print >> sys.stdout, "HGVS %s" % hgvs
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
461 var_aa = self.getVariantPeptide(toStopCodon=True)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
462 freq = "%2.2f" % self.freq if self.freq else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
463 depth = "%d" % self.depth if self.depth else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
464 aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
465 aa_len = "%d" % self.aa_len if self.aa_len else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
466 gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
467 stop_codon = self.alt_stop_codon if self.alt_stop_codon else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
468 chrpos = self.chrPos()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
469 aa_change = self.amino_acid_change if self.amino_acid_change else hgvs[5]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
470 return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,var_aa if var_aa else '']
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
471
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
472 def __main__():
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
473
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
474 def printReportTsv(output_file, snpEffects):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
475 if output_file:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
476 print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
477 for snpEffect in snpEffects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
478 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
479 if not snpEffect.effect == 'FRAME_SHIFT':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
480 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
481 novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
482 print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
483
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
484 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
485 http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
486 http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
487 http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
488 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
489 def printReportHtml(output_file, detail_dir, snpEffects):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
490 if output_file:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
491 print >> output_file, '<HTML><BODY>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
492 print >> output_file, '<TABLE BORDER="1">\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
493 print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Penetrance Percent</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
494 for snpEffect in snpEffects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
495 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,novel_peptide) = snpEffect.getReportFields()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
496 refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
497 aa_ref = '#'.join([refname,aa_anchor])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
498 refpath = os.path.join(detail_dir,refname)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
499 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
500 ref_file = open(refpath,'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
501 printEffDetailHtml(ref_file,snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
502 ref_file.close()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
503 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
504 sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
505 if snpEffect.effect == 'FRAME_SHIFT':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
506 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref,novel_peptide)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
507 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
508 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
509 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,aa_ref, preAA,varAA,postAA)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
510 print >> output_file, '</TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
511 print >> output_file, '</BODY></HTML>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
512
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
513 def printEffDetailHtml(output_file,snpEffect,wrap_len=60):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
514 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
515 seq_len = len(coding_seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
516 print >> output_file, '<HTML><BODY>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
517 print >> output_file, '<TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
518 print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
519 print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
520 print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
521 print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
522 print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
523 print >> output_file, '</TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
524 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
525 alt_aa_pos = pos / 3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
526 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
527 for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
528 if alt_aa[j] != coding_aa[j]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
529 alt_aa_pos = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
530 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
531 alt_aa_end = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
532 for j in range(alt_aa_pos,len(alt_aa)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
533 if alt_aa[j] == '*':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
534 alt_aa_end = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
535 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
536 seq = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
537 for j in range(1,wrap_len+1):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
538 seq.append('-' if j % 10 else '|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
539 hrule = '</TD><TD>'.join(seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
540 print >> output_file, '<TABLE cellspacing="0">'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
541 for i in range(0,seq_len,wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
542 e = min(i + wrap_len,seq_len)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
543 sa = i/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
544 ea = (i+wrap_len)/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
545 print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
546 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
547 for j in range(i,i + wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
548 if j < len(coding_seq):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
549 if pos == j:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
550 print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
551 elif pos <= j < pos + len(ref):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
552 print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
553 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
554 print >> output_file, "<TD>%s</TD>" % coding_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
555 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
556 print >> output_file, "<TD></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
557 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
558 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
559 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
560 for j in range(sa,ea):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
561 if j < len(coding_aa):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
562 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
563 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
564 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
565 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
566 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
567 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
568 for j in range(sa,ea):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
569 if j < len(alt_aa):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
570 if alt_aa_pos == j:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
571 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
572 elif alt_aa_pos < j <= alt_aa_end:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
573 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
574 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
575 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
576 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
577 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
578 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
579 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
580 for j in range(i,i + wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
581 if j < len(alt_seq):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
582 if pos <= j < pos + len(alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
583 print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
584 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
585 print >> output_file, "<TD>%s</TD>" % alt_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
586 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
587 print >> output_file, "<TD></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
588 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
589 print >> output_file, "</TABLE>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
590 print >> output_file, "</BODY></HTML>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
591
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
592 def printSeqText(output_file,snpEffect, wrap_len=120):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
593 nw = 6
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
594 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
595 if coding_seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
596 seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
597 rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
598 for i in range(0,seq_len,wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
599 e = min(i + wrap_len,seq_len)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
600 sa = i/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
601 ea = e/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
602 print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
603 print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
604 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
605 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
606 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
607 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
608 if alt_seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
609 print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
610
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
611 # Parse the command line options
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
612 usage = "Usage: snpEff_cds_report.py filter [options]"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
613 parser = optparse.OptionParser(usage = usage)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
614 parser.add_option("-i", "--in",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
615 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
616 dest="vcfFile", help="input vcf file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
617 parser.add_option("-o", "--out",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
618 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
619 dest="output", help="output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
620 parser.add_option("-H", "--html_report",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
621 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
622 dest="html_file", help="html output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
623 parser.add_option("-D", "--html_dir",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
624 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
625 dest="html_dir", help="html output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
626 parser.add_option("-t", "--tsv_file",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
627 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
628 dest="tsv_file", help="TAB Separated Value (.tsv) output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
629 parser.add_option("-e", "--ensembl_host",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
630 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
631 dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
632 parser.add_option("-f", "--effects_filter",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
633 action="store", type="string", default=None,
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
634 dest="effects_filter", help="a list of effects to filter")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
635 parser.add_option("-O", "--ensembl_dataset",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
636 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
637 dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
638 parser.add_option("-p", "--polyA_limit",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
639 action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
640 parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
641 parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
642 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
643
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
644 (options, args) = parser.parse_args()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
645
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
646 debug = options.debug
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
647 ensembl_host = options.ensembl_host
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
648
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
649 # Check that a single vcf file is given.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
650 if options.vcfFile == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
651 parser.print_help()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
652 print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report "
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
653 exit(1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
654 outputFile = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
655 outputHtml = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
656 detailDir = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
657 outputTsv = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
658 tmpHtmlName = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
659 tmpHtml = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
660 effects_list = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
661 # Set the output file to stdout if no output file was specified.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
662 if options.output == None and options.html_file == None and options.tsv_file == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
663 outputFile = sys.stdout
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
664 if options.output != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
665 output = os.path.abspath(options.output)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
666 outputFile = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
667 if options.tsv_file != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
668 output = os.path.abspath(options.tsv_file)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
669 outputTsv = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
670 if options.html_file != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
671 output = os.path.abspath(options.html_file)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
672 outputHtml = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
673 if options.html_dir == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
674 detailDir = os.path.dirname(os.path.abspath(output))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
675 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
676 detailDir = options.html_dir
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
677 if detailDir:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
678 if not os.path.isdir(detailDir):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
679 os.makedirs(detailDir)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
680 if options.effects_filter:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
681 eff_codes = options.effects_filter.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
682 for val in eff_codes:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
683 code = val.strip()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
684 if code:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
685 effects_list.append(code)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
686
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
687 #
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
688 # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
689 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
690 snpEffects = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
691 homopolymers = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
692 polya_count = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
693 polya_list = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
694 fh = open( options.vcfFile )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
695 while True:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
696 line = fh.readline()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
697 if not line:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
698 break #EOF
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
699 if line:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
700 if outputFile:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
701 print >> outputFile, "%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
702 line = line.strip()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
703 if len(line) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
704 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
705 elif line.startswith('#'):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
706 # check for SnpEffVersion
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
707 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
708 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
709 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
710 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
711 if line.startswith('##SnpEffVersion='):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
712 m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
713 if m:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
714 SnpEffVersion = m.group(2)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
715 # check for SnpEff Info Description
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
716 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
717 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
718 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
719 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
720 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
721 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
722 pass
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
723 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
724 if debug: print >> sys.stdout, "%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
725 freq = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
726 depth = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
727 fields = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
728 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
729 if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
730 if options.debug: print(fields)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
731 for idx,alt in enumerate(alts.split(',')):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
732 if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
733 if not re.match('^[ATCG]*$',alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
734 print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
735 for info_item in info.split(';'):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
736 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
737 (key,val) = info_item.split('=',1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
738 if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333;
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
739 freqs = info_item.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
740 (k,v) = freqs[idx].split('=',1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
741 freq = v
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
742 elif key == 'DP':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
743 depth = val
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
744 elif key == 'EFF':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
745 transcript_ids = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
746 effects = val.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
747 ccds_dict = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
748 for effect in effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
749 (eff,effs) = effect.rstrip(')').split('(')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
750 eff_fields = effs.split('|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
751 if SnpEffVersion == '2':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
752 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
753 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
754 EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
755 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
756 (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
757 else: # SnpEffVersion 3 # adds Amino_Acid_length field
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
758 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
759 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
760 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
761 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
762 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
763 if transcript:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
764 transcript_ids.append(transcript)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
765 if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
766 if options.with_ccds:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
767 ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
768 if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
769 for effect in effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
770 snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
771 snpEffect.transcript_ids = transcript_ids
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
772 snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
773 if effects_list and not snpEffect.effect in effects_list:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
774 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
775 if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
776 if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
777 if snpEffect.cds_pos:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
778 if snpEffect.inHomopolymer(options.polyA_limit):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
779 homopolymers.append(snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
780 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
781 snpEffects.append(snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
782 if outputFile:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
783 print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields())
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
784 printSeqText(outputFile,snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
785 if snpEffect.cds and snpEffect.cds_pos and not options.all_effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
786 ## Just report the first sequence returned for a vcf line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
787 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
788 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
789 sys.stderr.write( "line: %s err: %s\n" % (line,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
790 print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
791 # Sort snpEffects
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
792 snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
793 if outputHtml:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
794 printReportHtml(outputHtml, detailDir, snpEffects)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
795 if outputTsv:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
796 printReportTsv(outputTsv,snpEffects)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
797 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
798 print(str(e))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
799
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
800 if __name__ == "__main__": __main__()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
801