diff ensembl_variant_report.py @ 1:a67b4de184c2 draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/ensembl_variant_report commit b62e927469f96a857e880c77530c50c885ad92fc-dirty
author jjohnson
date Thu, 14 Jun 2018 18:08:15 -0400
parents 9f4ea174ce3d
children f87fe6bc48f4
line wrap: on
line diff
--- a/ensembl_variant_report.py	Thu Jun 14 17:51:39 2018 -0400
+++ b/ensembl_variant_report.py	Thu Jun 14 18:08:15 2018 -0400
@@ -152,98 +152,112 @@
     try:
         parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf
         for tid,pos1,ref,alt,dp,freq in parse_input():
+            if options.debug: print >> sys.stderr, "%s\t%d\t%s\t%s\t%d\t%f" % (tid,pos1,ref,alt,dp,freq)
             if not tid:
                 continue
             if options.min_depth and dp is not None and dp < options.min_depth:
                 continue
             if options.min_freq and freq is not None and freq < options.min_freq:
                 continue
-            ## transcript_id, pos, ref, alt, dp, dpr
-            tx = ens_ref.get_gtf_transcript(tid)
-            if not tx:
-                continue
-            coding = ens_ref.transcript_is_coding(tid)
-            if not coding:
-                continue
-            frame_shift = len(ref) != len(alt)
-            cds = ens_ref.get_cds(tid)
-            pos0 = pos1 - 1 # zero based position
-            spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1
-            alt_seq = alt if tx.gene.strand else reverse_complement(alt)
-            ref_seq = ref if tx.gene.strand else reverse_complement(ref)
-            cds_pos = ens_ref.genome_to_cds_pos(tid, spos)
-            alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' 
-            offset = 0
-            if tx.gene.strand:
-                for i in range(min(len(ref),len(alt))):
-                    if ref[i] == alt[i]:
-                        offset = i
-                    else:
-                        break
-            else:
-                for i in range(-1,-min(len(ref),len(alt)) -1,-1):
-                    if ref[i] == alt[i]:
-                        offset = i
-                    else:
-                        break
-            refpep = translate(cds[:len(cds)/3*3])
-            pep = translate(alt_cds[:len(alt_cds)/3*3])
-            peplen = len(pep)
-            aa_pos = (cds_pos + offset) / 3
-            if aa_pos >= len(pep):
-                print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt)
-                continue
-            if frame_shift:
-                #find stop_codons
-                nstops = 0
-                stop_codons = []
-                for i in range(aa_pos,peplen):
-                    if refpep[i] != pep[i]:
-                        aa_pos = i
-                        break
-                for i in range(aa_pos,peplen):
-                    if pep[i] == '*':
-                        nstops += 1
-                        stop_codons.append("%s-%s%s" % (alt_cds[i*3-1],alt_cds[i*3:i*3+3],"-%s" % alt_cds[i*3+4] if len(alt_cds) > i*3 else ''))
-                        if nstops > options.readthrough:
-                            reported_peptide = pep[aa_pos:i+1]
-                            reported_stop_codon = ','.join(stop_codons)
+            try:
+                ## transcript_id, pos, ref, alt, dp, dpr
+                tx = ens_ref.get_gtf_transcript(tid)
+                if not tx and tid.find('.') > 0:
+                    tid = tid.split('.')[0]
+                    tx = ens_ref.get_gtf_transcript(tid)
+                if not tx:
+                    continue
+                if options.debug: print >> sys.stderr, "%s" % (tx)
+                coding = ens_ref.transcript_is_coding(tid)
+                if not coding:
+                    continue
+                frame_shift = len(ref) != len(alt)
+                cds = ens_ref.get_cds(tid)
+                if options.debug or not cds: print >> sys.stderr, "cds:\n%s" % (cds)
+                pos0 = pos1 - 1 # zero based position
+                spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1
+                alt_seq = alt if tx.gene.strand else reverse_complement(alt)
+                ref_seq = ref if tx.gene.strand else reverse_complement(ref)
+                cds_pos = ens_ref.genome_to_cds_pos(tid, spos)
+                if options.debug: print >> sys.stderr, "cds_pos: %s" % (str(cds_pos))
+                alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' 
+                offset = 0
+                if tx.gene.strand:
+                    for i in range(min(len(ref),len(alt))):
+                        if ref[i] == alt[i]:
+                            offset = i
+                        else:
+                            break
+                else:
+                    for i in range(-1,-min(len(ref),len(alt)) -1,-1):
+                        if ref[i] == alt[i]:
+                            offset = i
+                        else:
                             break
-            else:
-                reported_stop_codon = "%s-%s" % (alt_cds[peplen*3-4],alt_cds[peplen*3-3:peplen*3])
-                reported_peptide = "%s_%s_%s" % (pep[max(aa_pos-options.leading_aa,0):aa_pos],
-                                                 pep[aa_pos],
-                                                 pep[aa_pos+1:min(aa_pos+1+options.trailing_aa,len(pep))])
-            cs_pos = aa_pos * 3
-            aa_pos = (cds_pos + offset) / 3
-            ref_codon = cds[cs_pos:cs_pos+3]
-            ref_aa = translate(ref_codon)
-            alt_codon = alt_cds[cs_pos:cs_pos+3]
-            alt_aa = translate(alt_codon)
-            gene = tx.gene.names[0]
-            report_fields = [tx.gene.names[0],
-             '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'),
-             ref_seq,
-             alt_seq,
-             "%1.2f" % freq if freq is not None else '',
-             str(dp),
-             "%s|%s" % (tx.gene.gene_id,tx.cdna_id),
-             "%d" % (aa_pos + 1),
-             "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa),
-             "%d" % len(pep),
-             reported_stop_codon,
-             reported_peptide
-             ]
-            if options.debug:
-                report_fields.append("%d %d %d %d  %s  %s" % (cds_pos, offset, cs_pos,aa_pos,ref_codon,alt_codon))
-            outputFile.write('\t'.join(report_fields))
-            if options.debug:
-                print >> sys.stderr, "%s %s\n%s\n%s\n%s %s" % (
-                         cds[cs_pos-6:cs_pos], cds[cs_pos:cs_pos+15],
-                         translate(cds[cs_pos-6:cs_pos+15]),
-                         translate(alt_cds[cs_pos-6:cs_pos+15]),
-                         alt_cds[cs_pos-6:cs_pos], alt_cds[cs_pos:cs_pos+15])
-            outputFile.write('\n')
+                refpep = translate(cds[:len(cds)/3*3])
+                pep = translate(alt_cds[:len(alt_cds)/3*3])
+                peplen = len(pep)
+                aa_pos = (cds_pos + offset) / 3
+                reported_stop_codon = ''
+                if aa_pos >= len(pep):
+                    print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s" % (aa_pos,len(pep),tid,pos1,ref,alt)
+                    continue
+                if frame_shift:
+                    #find stop_codons
+                    nstops = 0
+                    stop_codons = []
+                    for i in range(aa_pos,peplen):
+                        if refpep[i] != pep[i]:
+                            aa_pos = i
+                            break
+                    reported_peptide = pep[aa_pos:]
+                    for i in range(aa_pos,peplen):
+                        if pep[i] == '*':
+                            nstops += 1
+                            stop_codons.append("%s-%s%s" % (alt_cds[i*3-1],alt_cds[i*3:i*3+3],"-%s" % alt_cds[i*3+4] if len(alt_cds) > i*3 else ''))
+                            if nstops > options.readthrough:
+                                reported_peptide = pep[aa_pos:i+1]
+                                reported_stop_codon = ','.join(stop_codons)
+                                break
+                else:
+                    reported_stop_codon = "%s-%s" % (alt_cds[peplen*3-4],alt_cds[peplen*3-3:peplen*3])
+                    reported_peptide = "%s_%s_%s" % (pep[max(aa_pos-options.leading_aa,0):aa_pos],
+                                                     pep[aa_pos],
+                                                     pep[aa_pos+1:min(aa_pos+1+options.trailing_aa,len(pep))])
+                cs_pos = aa_pos * 3
+                aa_pos = (cds_pos + offset) / 3
+                ref_codon = cds[cs_pos:cs_pos+3]
+                ref_aa = translate(ref_codon)
+                alt_codon = alt_cds[cs_pos:cs_pos+3]
+                alt_aa = translate(alt_codon)
+                gene = tx.gene.names[0]
+                report_fields = [tx.gene.names[0],
+                 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'),
+                 ref_seq,
+                 alt_seq,
+                 "%1.2f" % freq if freq is not None else '',
+                 str(dp),
+                 "%s|%s" % (tx.gene.gene_id,tx.cdna_id),
+                 "%d" % (aa_pos + 1),
+                 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa),
+                 "%d" % len(pep),
+                 reported_stop_codon,
+                 reported_peptide,
+                 tx.get_transcript_type_name()
+                 ]
+                if options.debug:
+                    report_fields.append("%d %d %d %d  %s  %s" % (cds_pos, offset, cs_pos,aa_pos,ref_codon,alt_codon))
+                outputFile.write('\t'.join(report_fields))
+                if options.debug:
+                    print >> sys.stderr, "%s %s\n%s\n%s\n%s %s" % (
+                             cds[cs_pos-6:cs_pos], cds[cs_pos:cs_pos+15],
+                             translate(cds[cs_pos-6:cs_pos+15]),
+                             translate(alt_cds[cs_pos-6:cs_pos+15]),
+                             alt_cds[cs_pos-6:cs_pos], alt_cds[cs_pos:cs_pos+15])
+                outputFile.write('\n')
+            except Exception, e:
+                print >> sys.stderr, "failed: %s" % e
+                print >> sys.stderr, "%s\t%d\t%s\t%s\t%d\t%f\t%s" % (tid,pos1,ref,alt,dp,freq,e)
     except Exception, e:
         print >> sys.stderr, "failed: %s" % e
         exit(1)