comparison ensembl_variant_report.py @ 1:a67b4de184c2 draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/ensembl_variant_report commit b62e927469f96a857e880c77530c50c885ad92fc-dirty
author jjohnson
date Thu, 14 Jun 2018 18:08:15 -0400
parents 9f4ea174ce3d
children f87fe6bc48f4
comparison
equal deleted inserted replaced
0:9f4ea174ce3d 1:a67b4de184c2
150 print >> sys.stderr, "Parsing gene model failed: %s" % e 150 print >> sys.stderr, "Parsing gene model failed: %s" % e
151 exit(2) 151 exit(2)
152 try: 152 try:
153 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf 153 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf
154 for tid,pos1,ref,alt,dp,freq in parse_input(): 154 for tid,pos1,ref,alt,dp,freq in parse_input():
155 if options.debug: print >> sys.stderr, "%s\t%d\t%s\t%s\t%d\t%f" % (tid,pos1,ref,alt,dp,freq)
155 if not tid: 156 if not tid:
156 continue 157 continue
157 if options.min_depth and dp is not None and dp < options.min_depth: 158 if options.min_depth and dp is not None and dp < options.min_depth:
158 continue 159 continue
159 if options.min_freq and freq is not None and freq < options.min_freq: 160 if options.min_freq and freq is not None and freq < options.min_freq:
160 continue 161 continue
161 ## transcript_id, pos, ref, alt, dp, dpr 162 try:
162 tx = ens_ref.get_gtf_transcript(tid) 163 ## transcript_id, pos, ref, alt, dp, dpr
163 if not tx: 164 tx = ens_ref.get_gtf_transcript(tid)
164 continue 165 if not tx and tid.find('.') > 0:
165 coding = ens_ref.transcript_is_coding(tid) 166 tid = tid.split('.')[0]
166 if not coding: 167 tx = ens_ref.get_gtf_transcript(tid)
167 continue 168 if not tx:
168 frame_shift = len(ref) != len(alt) 169 continue
169 cds = ens_ref.get_cds(tid) 170 if options.debug: print >> sys.stderr, "%s" % (tx)
170 pos0 = pos1 - 1 # zero based position 171 coding = ens_ref.transcript_is_coding(tid)
171 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 172 if not coding:
172 alt_seq = alt if tx.gene.strand else reverse_complement(alt) 173 continue
173 ref_seq = ref if tx.gene.strand else reverse_complement(ref) 174 frame_shift = len(ref) != len(alt)
174 cds_pos = ens_ref.genome_to_cds_pos(tid, spos) 175 cds = ens_ref.get_cds(tid)
175 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' 176 if options.debug or not cds: print >> sys.stderr, "cds:\n%s" % (cds)
176 offset = 0 177 pos0 = pos1 - 1 # zero based position
177 if tx.gene.strand: 178 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1
178 for i in range(min(len(ref),len(alt))): 179 alt_seq = alt if tx.gene.strand else reverse_complement(alt)
179 if ref[i] == alt[i]: 180 ref_seq = ref if tx.gene.strand else reverse_complement(ref)
180 offset = i 181 cds_pos = ens_ref.genome_to_cds_pos(tid, spos)
181 else: 182 if options.debug: print >> sys.stderr, "cds_pos: %s" % (str(cds_pos))
182 break 183 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else ''
183 else: 184 offset = 0
184 for i in range(-1,-min(len(ref),len(alt)) -1,-1): 185 if tx.gene.strand:
185 if ref[i] == alt[i]: 186 for i in range(min(len(ref),len(alt))):
186 offset = i 187 if ref[i] == alt[i]:
187 else: 188 offset = i
188 break 189 else:
189 refpep = translate(cds[:len(cds)/3*3])
190 pep = translate(alt_cds[:len(alt_cds)/3*3])
191 peplen = len(pep)
192 aa_pos = (cds_pos + offset) / 3
193 if aa_pos >= len(pep):
194 print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt)
195 continue
196 if frame_shift:
197 #find stop_codons
198 nstops = 0
199 stop_codons = []
200 for i in range(aa_pos,peplen):
201 if refpep[i] != pep[i]:
202 aa_pos = i
203 break
204 for i in range(aa_pos,peplen):
205 if pep[i] == '*':
206 nstops += 1
207 stop_codons.append("%s-%s%s" % (alt_cds[i*3-1],alt_cds[i*3:i*3+3],"-%s" % alt_cds[i*3+4] if len(alt_cds) > i*3 else ''))
208 if nstops > options.readthrough:
209 reported_peptide = pep[aa_pos:i+1]
210 reported_stop_codon = ','.join(stop_codons)
211 break 190 break
212 else: 191 else:
213 reported_stop_codon = "%s-%s" % (alt_cds[peplen*3-4],alt_cds[peplen*3-3:peplen*3]) 192 for i in range(-1,-min(len(ref),len(alt)) -1,-1):
214 reported_peptide = "%s_%s_%s" % (pep[max(aa_pos-options.leading_aa,0):aa_pos], 193 if ref[i] == alt[i]:
215 pep[aa_pos], 194 offset = i
216 pep[aa_pos+1:min(aa_pos+1+options.trailing_aa,len(pep))]) 195 else:
217 cs_pos = aa_pos * 3 196 break
218 aa_pos = (cds_pos + offset) / 3 197 refpep = translate(cds[:len(cds)/3*3])
219 ref_codon = cds[cs_pos:cs_pos+3] 198 pep = translate(alt_cds[:len(alt_cds)/3*3])
220 ref_aa = translate(ref_codon) 199 peplen = len(pep)
221 alt_codon = alt_cds[cs_pos:cs_pos+3] 200 aa_pos = (cds_pos + offset) / 3
222 alt_aa = translate(alt_codon) 201 reported_stop_codon = ''
223 gene = tx.gene.names[0] 202 if aa_pos >= len(pep):
224 report_fields = [tx.gene.names[0], 203 print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s" % (aa_pos,len(pep),tid,pos1,ref,alt)
225 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), 204 continue
226 ref_seq, 205 if frame_shift:
227 alt_seq, 206 #find stop_codons
228 "%1.2f" % freq if freq is not None else '', 207 nstops = 0
229 str(dp), 208 stop_codons = []
230 "%s|%s" % (tx.gene.gene_id,tx.cdna_id), 209 for i in range(aa_pos,peplen):
231 "%d" % (aa_pos + 1), 210 if refpep[i] != pep[i]:
232 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa), 211 aa_pos = i
233 "%d" % len(pep), 212 break
234 reported_stop_codon, 213 reported_peptide = pep[aa_pos:]
235 reported_peptide 214 for i in range(aa_pos,peplen):
236 ] 215 if pep[i] == '*':
237 if options.debug: 216 nstops += 1
238 report_fields.append("%d %d %d %d %s %s" % (cds_pos, offset, cs_pos,aa_pos,ref_codon,alt_codon)) 217 stop_codons.append("%s-%s%s" % (alt_cds[i*3-1],alt_cds[i*3:i*3+3],"-%s" % alt_cds[i*3+4] if len(alt_cds) > i*3 else ''))
239 outputFile.write('\t'.join(report_fields)) 218 if nstops > options.readthrough:
240 if options.debug: 219 reported_peptide = pep[aa_pos:i+1]
241 print >> sys.stderr, "%s %s\n%s\n%s\n%s %s" % ( 220 reported_stop_codon = ','.join(stop_codons)
242 cds[cs_pos-6:cs_pos], cds[cs_pos:cs_pos+15], 221 break
243 translate(cds[cs_pos-6:cs_pos+15]), 222 else:
244 translate(alt_cds[cs_pos-6:cs_pos+15]), 223 reported_stop_codon = "%s-%s" % (alt_cds[peplen*3-4],alt_cds[peplen*3-3:peplen*3])
245 alt_cds[cs_pos-6:cs_pos], alt_cds[cs_pos:cs_pos+15]) 224 reported_peptide = "%s_%s_%s" % (pep[max(aa_pos-options.leading_aa,0):aa_pos],
246 outputFile.write('\n') 225 pep[aa_pos],
226 pep[aa_pos+1:min(aa_pos+1+options.trailing_aa,len(pep))])
227 cs_pos = aa_pos * 3
228 aa_pos = (cds_pos + offset) / 3
229 ref_codon = cds[cs_pos:cs_pos+3]
230 ref_aa = translate(ref_codon)
231 alt_codon = alt_cds[cs_pos:cs_pos+3]
232 alt_aa = translate(alt_codon)
233 gene = tx.gene.names[0]
234 report_fields = [tx.gene.names[0],
235 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'),
236 ref_seq,
237 alt_seq,
238 "%1.2f" % freq if freq is not None else '',
239 str(dp),
240 "%s|%s" % (tx.gene.gene_id,tx.cdna_id),
241 "%d" % (aa_pos + 1),
242 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa),
243 "%d" % len(pep),
244 reported_stop_codon,
245 reported_peptide,
246 tx.get_transcript_type_name()
247 ]
248 if options.debug:
249 report_fields.append("%d %d %d %d %s %s" % (cds_pos, offset, cs_pos,aa_pos,ref_codon,alt_codon))
250 outputFile.write('\t'.join(report_fields))
251 if options.debug:
252 print >> sys.stderr, "%s %s\n%s\n%s\n%s %s" % (
253 cds[cs_pos-6:cs_pos], cds[cs_pos:cs_pos+15],
254 translate(cds[cs_pos-6:cs_pos+15]),
255 translate(alt_cds[cs_pos-6:cs_pos+15]),
256 alt_cds[cs_pos-6:cs_pos], alt_cds[cs_pos:cs_pos+15])
257 outputFile.write('\n')
258 except Exception, e:
259 print >> sys.stderr, "failed: %s" % e
260 print >> sys.stderr, "%s\t%d\t%s\t%s\t%d\t%f\t%s" % (tid,pos1,ref,alt,dp,freq,e)
247 except Exception, e: 261 except Exception, e:
248 print >> sys.stderr, "failed: %s" % e 262 print >> sys.stderr, "failed: %s" % e
249 exit(1) 263 exit(1)
250 264
251 265