annotate snpEff_cds_report.py @ 4:429a6b4df5e5

Restrict to info items that are key=value pairs
author Jim Johnson <jj@umn.edu>
date Wed, 24 Apr 2013 08:00:04 -0500
parents 15a54fa11ad7
children 85b933b7d231
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/python
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
2 import sys,os,tempfile,re
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
3 import httplib, urllib
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
4 import optparse
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
5 #import vcfClass
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
6 #from vcfClass import *
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
7 #import tools
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
8 #from tools import *
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
9
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
10 debug = False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
11 cds_anchor = 'cds_variation'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
12 aa_anchor = 'aa_variation'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
13
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
14 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
15 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
16 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
17 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
18 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
19 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
20 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
21 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
22 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
23 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
24 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
25 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
26 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
27 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
28 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
29 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
30
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
31 def reverseComplement(seq) :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
32 rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
33 return rev
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
34
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
35 def translate(seq) :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
36 rna = seq.upper().replace('T','U')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
37 aa = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
38 for i in range(0,len(rna) - 2, 3):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
39 aa.append(codon_map[rna[i:i+3]])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
40 return ''.join(aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
41
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
42 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
43 SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
44 Queries that request a sequence always return the sequence in the first column regardless of the order of attributes.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
45 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
46 query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
47 <!DOCTYPE Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
48 <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
49 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
50 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
51 <Filter name = "with_ccds" excluded = "0"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
52 <Attribute name = "ensembl_transcript_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
53 <Attribute name = "ccds" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
54 </Dataset>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
55 </Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
56 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
57 ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
58 query_template = """<?xml version="1.0" encoding="UTF-8"?>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
59 <!DOCTYPE Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
60 <Query virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
61 <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
62 <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
63 <Filter name = "biotype" value = "protein_coding"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
64 __CCDS_FILTER_HERE__
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
65 <Attribute name = "cdna" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
66 <Attribute name = "ensembl_gene_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
67 <Attribute name = "ensembl_transcript_id" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
68 <Attribute name = "strand" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
69 <Attribute name = "transcript_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
70 <Attribute name = "transcript_end"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
71 <Attribute name = "exon_chrom_start"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
72 <Attribute name = "exon_chrom_end"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
73 <Attribute name = "cdna_coding_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
74 <Attribute name = "cdna_coding_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
75 <Attribute name = "cds_length"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
76 <Attribute name = "rank"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
77 <Attribute name = "5_utr_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
78 <Attribute name = "5_utr_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
79 <Attribute name = "3_utr_start" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
80 <Attribute name = "3_utr_end" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
81 <Attribute name = "ensembl_peptide_id"/>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
82 <Attribute name = "start_position" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
83 <Attribute name = "end_position" />
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
84 </Dataset>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
85 </Query>
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
86 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
87 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
88 Columns(19):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
89 ['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)']
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
90
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
91 0 cDNA sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
92 1 Ensembl Gene ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
93 2 Ensembl Transcript ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
94 3 Strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
95 - 4 Transcript Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
96 - 5 Transcript End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
97 6 Exon Chr Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
98 7 Exon Chr End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
99 8 CDS Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
100 9 CDS End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
101 10 CDS Length
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
102 - 11 Exon Rank in Transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
103 12 5' UTR Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
104 13 5' UTR End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
105 14 3' UTR Start
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
106 15 3' UTR End
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
107 - 16 Ensembl Protein ID
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
108 - 17 Gene Start (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
109 - 18 Gene End (bp)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
110 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
111
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
112 # return {transcript_id : ccds_id}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
113 def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
114 ccds_dict = dict()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
115 if transcript_ids:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
116 query = query_ccds_template
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
117 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
118 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
119 params = urllib.urlencode({'query':query})
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
120 headers = {"Accept": "text/plain"}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
121 if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
122 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
123 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
124 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
125 conn.request("POST", "/biomart/martservice", params, headers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
126 response = conn.getresponse()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
127 data = response.read()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
128 if len(data) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
129 # if debug: print >> sys.stdout, "%s\n\n" % data
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
130 lines = data.split('\n')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
131 for line in lines:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
132 fields = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
133 if len(fields) == 2:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
134 (transcript_id,ccds) = fields
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
135 ccds_dict[transcript_id] = ccds
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
136 if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
137 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
138 sys.stdout.write( "transcript_ids: %s %s\n" % (transcript_ids, e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
139 return ccds_dict
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
140
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
141 def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
142 query = query_template
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
143 query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
144 query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
145 query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
146 return query
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
147
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
148 # return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
149 def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
150 transcript_id = snpEffect.transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
151 chr_pos = snpEffect.pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
152 query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
153 if debug: print >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
154 params = urllib.urlencode({'query':query})
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
155 headers = {"Accept": "text/plain"}
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
156 pos = snpEffect.pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
157 cds_pos = None # zero based offset
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
158 coding_sequence = ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
159 cds_ref = snpEffect.ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
160 cds_alt = snpEffect.alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
161 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
162 if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
163 conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
164 conn.request("POST", "/biomart/martservice", params, headers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
165 response = conn.getresponse()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
166 data = response.read()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
167 if len(data) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
168 # if debug: print >> sys.stdout, "%s\n\n" % data
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
169 lines = data.split('\n')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
170 # Use the header line to determine the order of fields
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
171 line = lines[0]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
172 # if debug: print >> sys.stdout, "Headers:\n%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
173 colnames = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
174 # for i in range(len(colnames)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
175 #
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
176 if debug: print >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
177 for line in lines[1:]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
178 if line == None or len(line) < 2:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
179 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
180 field = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
181 if len(field) != len(colnames):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
182 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
183 if debug: print >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
184 if field[10] == None or len(field[10]) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
185 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
186 if debug:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
187 for i in range(len(colnames)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
188 print >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
189 snpEffect.gene_id = field[1]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
190 strand = '+' if int(field[3]) > 0 else '-'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
191 snpEffect.strand = strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
192 # coding_sequence is first
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
193 if len(field) > 10 and re.match('[ATGCN]+',field[0]):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
194 if field[10] == None or len(field[10]) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
195 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
196 cdna_seq = field[0]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
197 in_utr = False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
198 # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
199 utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
200 utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
201 utr_5_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
202 for i in range(len(utr_5_starts)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
203 utr_5_start = int(utr_5_starts[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
204 utr_5_end = int(utr_5_ends[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
205 utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
206 if utr_5_start <= pos <= utr_5_end :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
207 in_utr = True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
208 if debug: print >> sys.stdout, "in utr_5: %s %s %s\n" % (utr_5_start,pos,utr_5_end);
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
209 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
210 utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
211 utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
212 for i in range(len(utr_3_starts)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
213 utr_3_start = int(utr_3_starts[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
214 utr_3_end = int(utr_3_ends[i])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
215 if utr_3_start <= pos <= utr_3_end :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
216 in_utr = True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
217 if debug: print >> sys.stdout, "in utr_3: %s %s %s\n" % (utr_3_start,pos,utr_3_end);
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
218 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
219 # Get coding sequence from cdna
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
220 cdna_length = int(field[10])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
221 cdna_coding_start = sorted(eval('[' + re.sub(';',',',field[8]) + ']'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
222 cdna_coding_end = sorted(eval('[' + re.sub(';',',',field[9]) + ']'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
223 cdna_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
224 for i in range(len(cdna_coding_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
225 cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
226 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
227 cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
228 for i in range(len(cdna_coding_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
229 if debug: print >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
230 coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
231 snpEffect.cds = coding_sequence
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
232 if coding_sequence and len(coding_sequence) >= 3:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
233 snpEffect.cds_stop_codon = coding_sequence[-3:]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
234 if debug: print >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
235 if debug: print >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
236 if debug: print >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
237 if not in_utr:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
238 exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
239 exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
240 exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-'))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
241 exon_lens = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
242 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
243 exon_lens.append(exon_end[i] - exon_start[i] + 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
244 if debug: print >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
245 if debug:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
246 bp_tot = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
247 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
248 exon_len = exon_end[i] - exon_start[i] + 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
249 bp_tot += exon_len
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
250 print >> sys.stdout, "test: %s %s %s %s %s %d %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
251 cds_idx = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
252 for i in range(len(exon_start)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
253 # Does this exon have cds bases - i.e. not entirely in UTR
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
254 if len(utr_5_ends) > 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
255 if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
256 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
257 if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
258 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
259 exon_len = exon_end[i] - exon_start[i] + 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
260 if exon_start[i] <= pos <= exon_end[i]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
261 if strand == '+':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
262 if cds_idx: # find offset from start of cdna_coding and exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
263 offset = pos - exon_start[i]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
264 else: # find offset from end of cdna_coding and exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
265 offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
266 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
267 else: # negative strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
268 cds_ref = reverseComplement(snpEffect.ref)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
269 cds_alt = reverseComplement(snpEffect.alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
270 offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
271 cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
272 snpEffect.cds_pos = cds_pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
273 snpEffect.cds_ref = cds_ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
274 snpEffect.cds_alt = cds_alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
275 return snpEffect
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
276 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
277 cds_idx += 1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
278 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
279 sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
280 finally:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
281 if conn != None :
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
282 conn.close()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
283 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
284
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
285 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
286 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
287 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
288 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
289 class SnpEffect( object ):
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
290 report_headings = ['Gene','Variant Position','Reference','Variation','Penetrance','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','Stop Region','AA Variation']
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
291 def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
292 self.chrom = chrom
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
293 self.pos = int(pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
294 self.ref = ref
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
295 self.alt = alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
296 self.freq = float(freq) if freq else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
297 self.depth = int(depth) if depth else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
298 # From SnpEffect VCF String
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
299 self.effect = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
300 self.effect_impact = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
301 self.functional_class = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
302 self.codon_change = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
303 self.amino_acid_change = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
304 self.amino_acid_length = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
305 self.gene_name = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
306 self.gene_id = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
307 self.gene_biotype = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
308 self.coding = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
309 self.transcript = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
310 self.transcript_ids = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
311 self.exon = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
312 self.cds_stop_codon = None
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
313 self.cds_stop_pos = None
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
314 # retrieve from ensembl
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
315 self.strand = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
316 self.cds_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
317 self.cds_ref = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
318 self.cds_alt = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
319 self.cds = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
320 self.aa_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
321 self.aa_len = None
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
322 self.alt_stop_pos = None
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
323 self.alt_stop_codon = None
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
324 self.alt_stop_region = None ## includes base before and after alt_stop_codon
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
325 def chrPos(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
326 return "%s:%s" % (self.chrom,self.pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
327 def setEffect(self, effect, snpEffVersion = None):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
328 if snpEffVersion and snpEffVersion == '2':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
329 (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
330 else: # SnpEffVersion 3 # adds Amino_Acid_length field
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
331 (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
332 self.amino_acid_length = amino_acid_length
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
333 self.effect_impact = effect_impact
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
334 self.functional_class = functional_class
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
335 self.codon_change = codon_change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
336 self.amino_acid_change = amino_acid_change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
337 self.gene_name = gene_name
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
338 self.gene_biotype = gene_biotype
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
339 self.coding = coding
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
340 self.transcript = transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
341 self.exon = exon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
342 def parseEffect(self, effect, snpEffVersion = None):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
343 (eff,effs) = effect.rstrip(')').split('(')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
344 self.effect = eff
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
345 eff_fields = effs.split('|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
346 if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
347 self.setEffect(eff_fields, snpEffVersion)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
348 def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
349 getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
350 return len(self.cds) > 0 if self.cds else False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
351 def score(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
352 return self.freq * self.depth if self.freq and self.depth else -1
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
353 def getCodingSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
354 return self.cds
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
355 def getVariantSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
356 if self.cds and self.cds_pos and self.cds_ref and self.cds_alt:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
357 return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
358 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
359 if debug: print >> sys.stdout, "getVariantSequence: %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
360 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
361 def getCodingAminoSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
362 seq = translate(self.cds) if self.cds else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
363 if seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
364 self.aa_len = len(seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
365 return seq
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
366 def getVariantAminoSequence(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
367 variant_seq = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
368 return translate(variant_seq) if variant_seq else None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
369 def getVariantPeptide(self,toStopCodon=True):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
370 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
371 if var_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
372 if toStopCodon:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
373 end_pos = var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
374 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
375 end_pos = len(var_aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
376 novel_peptide = var_aa[var_aa_pos:end_pos]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
377 return novel_peptide
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
378 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
379 # [preAA,varAA,postAA, varPeptide, varOffset, subAA]
1
29b286896c50 Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
380 def getNonSynonymousPeptide(self,start_count,end_count,toStopCodon=True):
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
381 (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
382 if var_aa:
1
29b286896c50 Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
383 start_pos = max(var_aa_pos - start_count,0) if start_count and int(start_count) >= 0 else 0
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
384 if toStopCodon:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
385 end_pos = var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
386 else:
1
29b286896c50 Change the default number of Amino Acids to display after a missense to 10.
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
387 end_offset = end_count + 1
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
388 end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
389 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
390 varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
391 if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
392 mutation = "p.%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
393 preAA = var_aa[start_pos:var_aa_pos] # N-term side
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
394 postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
395 novel_peptide = var_aa[start_pos:end_pos]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
396 return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
397 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
398 sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
399 return None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
400 # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
401 # positions are 0-based array indexes
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
402 def getSequences(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
403 coding_dna = self.getCodingSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
404 coding_aa = self.getCodingAminoSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
405 var_dna = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
406 var_aa = self.getVariantAminoSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
407 var_aa_pos = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
408 var_aa_end = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
409 if var_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
410 var_aa_pos = self.cds_pos / 3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
411 for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
412 if var_aa[j] != coding_aa[j]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
413 var_aa_pos = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
414 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
415 self.aa_pos = var_aa_pos
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
416 var_stop = var_aa.find('*',var_aa_pos)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
417 if var_stop < 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
418 var_aa_end = len(var_aa)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
419 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
420 var_aa_end = var_stop + 1 # include the stop codon
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
421 stop_pos = var_stop * 3
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
422 self.alt_stop_pos = stop_pos
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
423 self.alt_stop_codon = var_dna[stop_pos:stop_pos+3]
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
424 self.alt_stop_region = (var_dna[stop_pos - 1] + '-' if stop_pos > 0 else '') + \
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
425 self.alt_stop_codon + \
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
426 ( '-' + var_dna[stop_pos+3] if len(var_dna) > stop_pos+3 else '')
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
427 return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
428 def inHomopolymer(self,polyA_limit):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
429 if polyA_limit: ## library prep can results in inserting or deleting an A in a poly A region
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
430 ## check if base changes at cds_pos involve A or T
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
431 bdiff = None # the difference between the cds_ref and cds_alt
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
432 boff = 0 # the offset to the difference
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
433 if len(self.cds_ref) < len(self.cds_alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
434 bdiff = re.sub(self.cds_ref,'',self.cds_alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
435 boff = self.cds_alt.find(bdiff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
436 elif len(self.cds_alt) < len(self.cds_ref):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
437 bdiff = re.sub(self.cds_alt,'',self.cds_ref)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
438 boff = self.cds_ref.find(bdiff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
439 if bdiff:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
440 ## check the number of similar base neighboring the change
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
441 alt_seq = self.getVariantSequence()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
442 subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
443 ## adjust match length if the cps_pos is within polyA_limit form sequence end
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
444 match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
445 if debug: print >> sys.stdout, "polyA bdiff %s %s %d %d\n" % (bdiff,subseq, match_len, boff)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
446 ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
447 if bdiff.find('A') >= 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
448 pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
449 match = re.match(pat,subseq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
450 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
451 if match:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
452 return True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
453 if bdiff.find('T') >= 0:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
454 pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
455 match = re.match(pat,subseq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
456 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
457 if match:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
458 if debug: print >> sys.stdout, "%s %s %s %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups())
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
459 return True
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
460 return False
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
461 def getReportHeader():
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
462 return report_headings
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
463 def getReportFields(self):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
464 gene_name = self.gene_name if self.gene_name else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
465 cds_ref = self.cds_ref if self.cds_ref else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
466 cds_alt = self.cds_alt if self.cds_alt else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
467 hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
468 if debug: print >> sys.stdout, "HGVS %s" % hgvs
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
469 var_aa = self.getVariantPeptide(toStopCodon=True)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
470 freq = "%2.2f" % self.freq if self.freq else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
471 depth = "%d" % self.depth if self.depth else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
472 aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
473 aa_len = "%d" % self.aa_len if self.aa_len else ''
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
474 gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
475 stop_codon = self.alt_stop_codon if self.alt_stop_codon else ''
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
476 stop_region = self.alt_stop_region if self.alt_stop_region else ''
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
477 chrpos = self.chrPos()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
478 aa_change = self.amino_acid_change if self.amino_acid_change else hgvs[5]
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
479 return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,stop_region,var_aa if var_aa else '']
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
480
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
481 def __main__():
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
482
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
483 def printReportTsv(output_file, snpEffects):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
484 if output_file:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
485 print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
486 for snpEffect in snpEffects:
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
487 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields()
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
488 if not snpEffect.effect == 'FRAME_SHIFT':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
489 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
490 novel_peptide = "%s_%s_%s" % (preAA,varAA,postAA)
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
491 print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,novel_peptide])
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
492
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
493 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
494 http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
495 http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
496 http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
497 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
498 def printReportHtml(output_file, detail_dir, snpEffects):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
499 if output_file:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
500 print >> output_file, '<HTML><BODY>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
501 print >> output_file, '<TABLE BORDER="1">\n'
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
502 print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Penetrance</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n'
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
503 for snpEffect in snpEffects:
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
504 (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields()
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
505 refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
506 aa_ref = '#'.join([refname,aa_anchor])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
507 refpath = os.path.join(detail_dir,refname)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
508 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
509 ref_file = open(refpath,'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
510 printEffDetailHtml(ref_file,snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
511 ref_file.close()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
512 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
513 sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
514 if snpEffect.effect == 'FRAME_SHIFT':
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
515 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref,novel_peptide)
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
516 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
517 (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
2
15a54fa11ad7 snpEff_cds_report report base before and after stop codon of variant sequence
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
518 print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref, preAA,varAA,postAA)
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
519 print >> output_file, '</TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
520 print >> output_file, '</BODY></HTML>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
521
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
522 def printEffDetailHtml(output_file,snpEffect,wrap_len=60):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
523 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
524 seq_len = len(coding_seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
525 print >> output_file, '<HTML><BODY>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
526 print >> output_file, '<TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
527 print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
528 print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
529 print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
530 print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
531 print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
532 print >> output_file, '</TABLE>\n'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
533 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
534 alt_aa_pos = pos / 3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
535 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
536 for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
537 if alt_aa[j] != coding_aa[j]:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
538 alt_aa_pos = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
539 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
540 alt_aa_end = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
541 for j in range(alt_aa_pos,len(alt_aa)):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
542 if alt_aa[j] == '*':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
543 alt_aa_end = j
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
544 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
545 seq = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
546 for j in range(1,wrap_len+1):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
547 seq.append('-' if j % 10 else '|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
548 hrule = '</TD><TD>'.join(seq)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
549 print >> output_file, '<TABLE cellspacing="0">'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
550 for i in range(0,seq_len,wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
551 e = min(i + wrap_len,seq_len)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
552 sa = i/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
553 ea = (i+wrap_len)/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
554 print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
555 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
556 for j in range(i,i + wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
557 if j < len(coding_seq):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
558 if pos == j:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
559 print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
560 elif pos <= j < pos + len(ref):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
561 print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
562 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
563 print >> output_file, "<TD>%s</TD>" % coding_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
564 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
565 print >> output_file, "<TD></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
566 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
567 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
568 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
569 for j in range(sa,ea):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
570 if j < len(coding_aa):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
571 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
572 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
573 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
574 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
575 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
576 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
577 for j in range(sa,ea):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
578 if j < len(alt_aa):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
579 if alt_aa_pos == j:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
580 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j])
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
581 elif alt_aa_pos < j <= alt_aa_end:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
582 print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
583 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
584 print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
585 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
586 print >> output_file, "<TD COLSPAN=\"3\"></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
587 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
588 print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
589 for j in range(i,i + wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
590 if j < len(alt_seq):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
591 if pos <= j < pos + len(alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
592 print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
593 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
594 print >> output_file, "<TD>%s</TD>" % alt_seq[j]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
595 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
596 print >> output_file, "<TD></TD>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
597 print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
598 print >> output_file, "</TABLE>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
599 print >> output_file, "</BODY></HTML>"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
600
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
601 def printSeqText(output_file,snpEffect, wrap_len=120):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
602 nw = 6
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
603 (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
604 if coding_seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
605 seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
606 rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|'
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
607 for i in range(0,seq_len,wrap_len):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
608 e = min(i + wrap_len,seq_len)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
609 sa = i/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
610 ea = e/3
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
611 print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
612 print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
613 if coding_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
614 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(coding_aa[sa:ea]),nw,ea)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
615 if alt_aa:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
616 print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,' '.join(alt_aa[sa:ea]),nw,ea)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
617 if alt_seq:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
618 print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
619
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
620 # Parse the command line options
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
621 usage = "Usage: snpEff_cds_report.py filter [options]"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
622 parser = optparse.OptionParser(usage = usage)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
623 parser.add_option("-i", "--in",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
624 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
625 dest="vcfFile", help="input vcf file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
626 parser.add_option("-o", "--out",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
627 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
628 dest="output", help="output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
629 parser.add_option("-H", "--html_report",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
630 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
631 dest="html_file", help="html output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
632 parser.add_option("-D", "--html_dir",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
633 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
634 dest="html_dir", help="html output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
635 parser.add_option("-t", "--tsv_file",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
636 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
637 dest="tsv_file", help="TAB Separated Value (.tsv) output report file")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
638 parser.add_option("-e", "--ensembl_host",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
639 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
640 dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
641 parser.add_option("-f", "--effects_filter",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
642 action="store", type="string", default=None,
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
643 dest="effects_filter", help="a list of effects to filter")
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
644 parser.add_option("-O", "--ensembl_dataset",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
645 action="store", type="string",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
646 dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
647 parser.add_option("-p", "--polyA_limit",
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
648 action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
649 parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
650 parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
651 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
652
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
653 (options, args) = parser.parse_args()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
654
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
655 debug = options.debug
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
656 ensembl_host = options.ensembl_host
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
657
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
658 # Check that a single vcf file is given.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
659 if options.vcfFile == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
660 parser.print_help()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
661 print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report "
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
662 exit(1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
663 outputFile = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
664 outputHtml = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
665 detailDir = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
666 outputTsv = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
667 tmpHtmlName = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
668 tmpHtml = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
669 effects_list = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
670 # Set the output file to stdout if no output file was specified.
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
671 if options.output == None and options.html_file == None and options.tsv_file == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
672 outputFile = sys.stdout
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
673 if options.output != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
674 output = os.path.abspath(options.output)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
675 outputFile = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
676 if options.tsv_file != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
677 output = os.path.abspath(options.tsv_file)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
678 outputTsv = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
679 if options.html_file != None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
680 output = os.path.abspath(options.html_file)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
681 outputHtml = open(output, 'w')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
682 if options.html_dir == None:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
683 detailDir = os.path.dirname(os.path.abspath(output))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
684 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
685 detailDir = options.html_dir
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
686 if detailDir:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
687 if not os.path.isdir(detailDir):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
688 os.makedirs(detailDir)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
689 if options.effects_filter:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
690 eff_codes = options.effects_filter.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
691 for val in eff_codes:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
692 code = val.strip()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
693 if code:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
694 effects_list.append(code)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
695
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
696 #
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
697 # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
698 try:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
699 snpEffects = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
700 homopolymers = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
701 polya_count = 0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
702 polya_list = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
703 fh = open( options.vcfFile )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
704 while True:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
705 line = fh.readline()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
706 if not line:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
707 break #EOF
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
708 if line:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
709 if outputFile:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
710 print >> outputFile, "%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
711 line = line.strip()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
712 if len(line) < 1:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
713 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
714 elif line.startswith('#'):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
715 # check for SnpEffVersion
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
716 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
717 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
718 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
719 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
720 if line.startswith('##SnpEffVersion='):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
721 m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
722 if m:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
723 SnpEffVersion = m.group(2)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
724 # check for SnpEff Info Description
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
725 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
726 ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
727 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
728 ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
729 ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
730 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
731 pass
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
732 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
733 if debug: print >> sys.stdout, "%s\n" % line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
734 freq = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
735 depth = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
736 fields = line.split('\t')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
737 (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
738 if debug: print >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
739 if options.debug: print(fields)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
740 for idx,alt in enumerate(alts.split(',')):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
741 if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
742 if not re.match('^[ATCG]*$',alt):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
743 print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s %s\n" % (chrom,pos,alt)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
744 for info_item in info.split(';'):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
745 try:
4
429a6b4df5e5 Restrict to info items that are key=value pairs
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
746 if info_item.find('=') < 0:
429a6b4df5e5 Restrict to info items that are key=value pairs
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
747 continue
0
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
748 (key,val) = info_item.split('=',1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
749 if key == 'SAF': # Usually a SAF for each variant: if A,AG then SAF=0.333333333333333,SAF=0.333333333333333;
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
750 freqs = info_item.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
751 (k,v) = freqs[idx].split('=',1)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
752 freq = v
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
753 elif key == 'DP':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
754 depth = val
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
755 elif key == 'EFF':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
756 transcript_ids = []
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
757 effects = val.split(',')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
758 ccds_dict = None
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
759 for effect in effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
760 (eff,effs) = effect.rstrip(')').split('(')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
761 eff_fields = effs.split('|')
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
762 if SnpEffVersion == '2':
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
763 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
764 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
765 EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
766 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
767 (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
768 else: # SnpEffVersion 3 # adds Amino_Acid_length field
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
769 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
770 Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
771 FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
772 """
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
773 (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10]
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
774 if transcript:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
775 transcript_ids.append(transcript)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
776 if debug: print >> sys.stdout, "transcripts: %s" % (transcript_ids)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
777 if options.with_ccds:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
778 ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
779 if debug: print >> sys.stdout, "ccds_dict: %s" % (ccds_dict)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
780 for effect in effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
781 snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
782 snpEffect.transcript_ids = transcript_ids
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
783 snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
784 if effects_list and not snpEffect.effect in effects_list:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
785 continue
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
786 if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
787 if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
788 if snpEffect.cds_pos:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
789 if snpEffect.inHomopolymer(options.polyA_limit):
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
790 homopolymers.append(snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
791 else:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
792 snpEffects.append(snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
793 if outputFile:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
794 print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields())
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
795 printSeqText(outputFile,snpEffect)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
796 if snpEffect.cds and snpEffect.cds_pos and not options.all_effects:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
797 ## Just report the first sequence returned for a vcf line
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
798 break
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
799 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
800 sys.stderr.write( "line: %s err: %s\n" % (line,e) )
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
801 print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
802 # Sort snpEffects
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
803 snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
804 if outputHtml:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
805 printReportHtml(outputHtml, detailDir, snpEffects)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
806 if outputTsv:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
807 printReportTsv(outputTsv,snpEffects)
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
808 except Exception, e:
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
809 print(str(e))
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
810
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
811 if __name__ == "__main__": __main__()
cdbdac66d6b5 Uploaded
jjohnson
parents:
diff changeset
812