# HG changeset patch
# User rlegendre
# Date 1506502750 14400
# Node ID 01b945ba3697ce1814692164b5ad3d674f54d2fd
Uploaded
diff -r 000000000000 -r 01b945ba3697 ribo_tools/get_codon_frequency.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ribo_tools/get_codon_frequency.py Wed Sep 27 04:59:10 2017 -0400
@@ -0,0 +1,736 @@
+#!/usr/bin/env python2.7
+# -*- coding: utf-8 -*-
+
+'''
+ Created on sep. 2013
+ @author: rachel legendre
+ @copyright: rachel.legendre@igmors.u-psud.fr
+ @license: GPL v3
+'''
+
+from __future__ import division
+import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time
+import itertools
+from math import log10
+from decimal import Decimal
+from Bio import SeqIO
+from Bio.Seq import Seq
+from numpy import arange, std, array, linspace, average
+#from matplotlib import pyplot as pl
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as pl
+from matplotlib import font_manager
+from matplotlib import colors
+import csv
+from scipy import stats, errstate
+from collections import OrderedDict
+import ribo_functions
+import HTSeq
+# #libraries for debugg
+#import pdb
+import cPickle
+
+def stop_err(msg):
+ sys.stderr.write("%s\n" % msg)
+ sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
+ sys.exit()
+
+
+def init_codon_dict():
+
+ Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)])
+ return Codon_dict
+
+
+
+def get_codon_usage(bamfile, GFF, site, kmer, a_pos):
+ '''
+ Read GFF dict and get gene codon usage.
+ Return dict of codons usage
+ '''
+ try:
+ codon = init_codon_dict()
+ multi_tag = "XS:i:" ## bowtie Tag
+ tag = "IH:i:1" ## RUM tag
+
+ for feature in GFF :
+ if feature.type == 'gene' :
+ codon_dict = init_codon_dict()
+ chrom = feature.iv.chrom
+ start = feature.iv.start
+ stop = feature.iv.end
+ if start+50 < stop-50 :
+ region = chrom + ':' + str(start+50) + '-' + str(stop-50)
+ # #get all reads in this gene
+ reads = subprocess.check_output(["samtools", "view", bamfile, region])
+ head = subprocess.check_output(["samtools", "view", "-H", bamfile])
+ read_tab = reads.split('\n')
+ for read in read_tab:
+ # # search mapper for eliminate multiple alignements
+ if 'bowtie' in head:
+ multi_tag = "XS:i:"
+ elif 'bwa' in head:
+ multi_tag = "XT:A:R"
+ elif 'TopHat' in head:
+ tag = "NH:i:1"
+ else :
+ stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
+
+ if len(read) == 0:
+ continue
+ len_read = len(read.split('\t')[9])
+ # if it's read of good length
+ if len_read == kmer and (tag in read or multi_tag not in read):
+ feat = read.split('\t')
+ seq = feat[9]
+ # if it's a reverse read
+ if feat[1] == '16' :
+ if site == "A" :
+ # #get A-site
+ cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement())
+ elif site == "P" :
+ # #get P-site
+ cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement())
+ else :
+ # #get site-E
+ cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement())
+ # # test if it's a true codon not a CNG codon for example
+ if codon_dict.has_key(cod) :
+ codon_dict[cod] += 1
+ # if it's a forward read
+ elif feat[1] == '0' :
+ if site == "A" :
+ # #get A-site
+ cod = seq[a_pos:a_pos+3]
+ elif site == "P" :
+ # #get P-site
+ cod = seq[a_pos-3:a_pos]
+ else :
+ # #get site-E
+ cod = seq[a_pos-6:a_pos-3]
+ if codon_dict.has_key(cod) :
+ codon_dict[cod] += 1
+ del(read)
+ # # add in global dict
+ for cod, count in codon_dict.iteritems() :
+ codon[cod] += count
+ if sum(codon.values()) == 0 :
+ stop_err('There are no reads aligning on annotated genes in your GFF file')
+ else :
+ return codon
+
+ except Exception, e:
+ stop_err('Error during codon usage calcul: ' + str(e))
+
+
+
+
+'''
+http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html
+'''
+def violin_plot(ax, data, pos, bp=False):
+ '''
+ create violin plots on an axis
+ '''
+ dist = max(pos) - min(pos)
+ w = min(0.15 * max(dist, 1.0), 0.5)
+ for d, p in zip(data, pos):
+ k = stats.gaussian_kde(d) # calculates the kernel density
+ m = k.dataset.min() # lower bound of violin
+ M = k.dataset.max() # upper bound of violin
+ x = arange(m, M, (M - m) / 100.) # support for violin
+ v = k.evaluate(x) # violin profile (density curve)
+ v = v / v.max() * w # scaling the violin to the available space
+ ax.fill_betweenx(x, p, v + p, facecolor=color1, alpha=0.3)
+ ax.fill_betweenx(x, p, -v + p, facecolor=color2, alpha=0.3)
+ if bp:
+ ax.boxplot(data, notch=1, positions=pos, vert=1)
+
+
+
+'''
+http://log.ooz.ie/2013/02/matplotlib-comparative-histogram-recipe.html
+'''
+def comphist(x1, x2, orientation='vertical', **kwargs):
+ """Draw a comparative histogram."""
+ # Split keyword args:
+ kwargs1 = {}
+ kwargs2 = {}
+ kwcommon = {}
+ for arg in kwargs:
+ tgt_arg = arg[:-1]
+ if arg.endswith('1'):
+ arg_dict = kwargs1
+ elif arg.endswith('2'):
+ arg_dict = kwargs2
+ else:
+ arg_dict = kwcommon
+ tgt_arg = arg
+ arg_dict[tgt_arg] = kwargs[arg]
+ kwargs1.update(kwcommon)
+ kwargs2.update(kwcommon)
+
+ fig = pl.figure()
+
+ # Have both histograms share one axis.
+ if orientation == 'vertical':
+ ax1 = pl.subplot(211)
+ ax2 = pl.subplot(212, sharex=ax1)
+ # Flip the ax2 histogram horizontally.
+ ax2.set_ylim(ax1.get_ylim()[::-1])
+ pl.setp(ax1.get_xticklabels(), visible=False)
+ legend_loc = (1, 4)
+ else:
+ ax1 = pl.subplot(122)
+ ax2 = pl.subplot(121, sharey=ax1)
+ # Flip the ax2 histogram vertically.
+ ax2.set_xlim(ax2.get_xlim()[::-1])
+ pl.setp(ax1.get_yticklabels(), visible=False)
+ legend_loc = (1, 2)
+
+ ax1.hist(x1, orientation=orientation, **kwargs1)
+ ax2.hist(x2, orientation=orientation, **kwargs2)
+ ax2.set_ylim(ax1.get_ylim()[::-1])
+ ax1.legend(loc=legend_loc[0])
+ ax2.legend(loc=legend_loc[1])
+ # Tighten up the layout.
+ pl.subplots_adjust(wspace=0.0, hspace=0.0)
+ return fig
+
+
+def compute_FC_plot(cond1_norm, cond2_norm, cod_name, codon_to_test, dirout):
+
+ FC_tab = []
+ for z, y in zip(cond1_norm.itervalues(), cond2_norm.itervalues()):
+ fc = z - y
+ FC_tab.append(fc)
+ # #codon_to_test = ['TGA','TAG','TAA']
+
+ a = []
+ b = []
+ cod = []
+ for codon in cond1_norm.iterkeys():
+ if codon in codon_to_test :
+ fc = cond1_norm[codon] - cond2_norm[codon]
+ b.append(fc)
+ cod.append(codon)
+ else :
+ fc = cond1_norm[codon] - cond2_norm[codon]
+ a.append(fc)
+
+
+ fig = pl.figure(num=1)
+ comphist(array(a), array(b), label1='All codon', label2=cod_name, color2='green', bins=30, rwidth=1)
+ # pl.show()
+ pl.savefig(dirout + '/hist_codon_fc.png', format="png", dpi=340)
+ pl.clf()
+
+
+ # #violin plot
+ pos = range(2)
+ dat = array([array(a), array(b)])
+ fig = pl.figure()
+ pl.title("Distribution of codons FoldChange between two conditions")
+ ax = fig.add_subplot(1, 1, 1)
+ lab = array(['All codons', cod_name])
+ violin_plot(ax, dat, pos, bp=1)
+ for x, z in zip(dat, pos):
+ ax.plot(z, average(x), color='r', marker='*', markeredgecolor='r')
+ xtickNames = pl.setp(ax, xticklabels=lab)
+ pl.savefig(dirout + '/violinplot_codon.png', format="png", dpi=340)
+ pl.clf()
+
+ # (Fval,pval) = stats.ttest_ind(a, b, axis=0, equal_var=True)
+ (Fval, pval) = stats.mannwhitneyu(a, b)
+ return pval
+
+
+def get_aa_dict(cond1_norm, cond2_norm):
+
+ # ## create amino acid dictionnary:
+ AA = OrderedDict({})
+ AA['Phe'] = [cond1_norm['TTT'] + cond1_norm['TTC'], cond2_norm['TTT'] + cond2_norm['TTC']]
+ AA['Leu'] = [cond1_norm['TTA'] + cond1_norm['TTG'] + cond1_norm['CTT'] + cond1_norm['CTC'] + cond1_norm['CTA'] + cond1_norm['CTG'], cond2_norm['TTA'] + cond2_norm['TTG'] + cond2_norm['CTT'] + cond2_norm['CTC'] + cond2_norm['CTA'] + cond2_norm['CTG']]
+ AA['Ile'] = [cond1_norm['ATT'] + cond1_norm['ATC'] + cond1_norm['ATA'], cond2_norm['ATT'] + cond2_norm['ATC'] + cond2_norm['ATA']]
+ AA['Met'] = [cond1_norm['ATG'], cond2_norm['ATG']]
+ AA['Val'] = [cond1_norm['GTT'] + cond1_norm['GTC'] + cond1_norm['GTA'] + cond1_norm['GTG'] + cond1_norm['AGT'] + cond1_norm['AGC'], cond2_norm['GTT'] + cond2_norm['GTC'] + cond2_norm['GTA'] + cond2_norm['GTG'] + cond2_norm['AGT'] + cond2_norm['AGC']]
+ AA['Ser'] = [cond1_norm['TCT'] + cond1_norm['TCC'] + cond1_norm['TCA'] + cond1_norm['TCG'], cond2_norm['TCT'] + cond2_norm['TCC'] + cond2_norm['TCA'] + cond2_norm['TCG']]
+ AA['Pro'] = [cond1_norm['CCT'] + cond1_norm['CCC'] + cond1_norm['CCA'] + cond1_norm['CCG'], cond2_norm['CCT'] + cond2_norm['CCC'] + cond2_norm['CCA'] + cond2_norm['CCG']]
+ AA['Thr'] = [cond1_norm['ACT'] + cond1_norm['ACC'] + cond1_norm['ACA'] + cond1_norm['ACG'], cond2_norm['ACT'] + cond2_norm['ACC'] + cond2_norm['ACA'] + cond2_norm['ACG']]
+ AA['Ala'] = [cond1_norm['GCT'] + cond1_norm['GCC'] + cond1_norm['GCA'] + cond1_norm['GCG'], cond2_norm['GCT'] + cond2_norm['GCC'] + cond2_norm['GCA'] + cond2_norm['GCG']]
+ AA['Tyr'] = [cond1_norm['TAT'] + cond1_norm['TAC'], cond2_norm['TAT'] + cond2_norm['TAC']]
+ AA['Stop'] = [cond1_norm['TAA'] + cond1_norm['TAG'] + cond1_norm['TGA'], cond2_norm['TAA'] + cond2_norm['TAG'] + cond2_norm['TGA']]
+ AA['His'] = [cond1_norm['CAT'] + cond1_norm['CAC'], cond2_norm['CAT'] + cond2_norm['CAC']]
+ AA['Gln'] = [cond1_norm['CAA'] + cond1_norm['CAG'], cond2_norm['CAA'] + cond2_norm['CAG']]
+ AA['Asn'] = [cond1_norm['AAT'] + cond1_norm['AAC'], cond2_norm['AAT'] + cond2_norm['AAC']]
+ AA['Lys'] = [cond1_norm['AAA'] + cond1_norm['AAG'], cond2_norm['AAA'] + cond2_norm['AAG']]
+ AA['Asp'] = [cond1_norm['GAT'] + cond1_norm['GAC'], cond2_norm['GAT'] + cond2_norm['GAC']]
+ AA['Glu'] = [cond1_norm['GAA'] + cond1_norm['GAG'], cond2_norm['GAA'] + cond2_norm['GAG']]
+ AA['Cys'] = [cond1_norm['TGT'] + cond1_norm['TGC'], cond2_norm['TGT'] + cond2_norm['TGC']]
+ AA['Trp'] = [cond1_norm['TGG'], cond2_norm['TGG']]
+ AA['Arg'] = [cond1_norm['CGT'] + cond1_norm['CGC'] + cond1_norm['CGA'] + cond1_norm['CGG'] + cond1_norm['AGA'] + cond1_norm['AGG'], cond2_norm['CGT'] + cond2_norm['CGC'] + cond2_norm['CGA'] + cond2_norm['CGG'] + cond2_norm['AGA'] + cond2_norm['AGG']]
+ AA['Gly'] = [cond1_norm['GGT'] + cond1_norm['GGC'] + cond1_norm['GGA'] + cond1_norm['GGG'], cond2_norm['GGT'] + cond2_norm['GGC'] + cond2_norm['GGA'] + cond2_norm['GGG']]
+
+
+ return AA
+
+
+
+def plot_codon_usage(result, dirout, c1, c2, outfile, color1, color2):
+ '''
+ Take list of dict of codon usage and use matplotlib for do graph
+ '''
+
+ # #if there are replicat
+ if len(result) == 4 :
+ # store each dict in variables to make code more readable
+ cond1_1 = result[0].copy()
+ cond1_2 = result[1].copy()
+ cond2_1 = result[2].copy()
+ cond2_2 = result[3].copy()
+ # get codon order in one of list
+ codon_sorted = sorted(cond1_1.iterkeys(), reverse=False)
+ try:
+ # get max of each list
+ sum11 = sum(list(cond1_1.itervalues()))
+ sum12 = sum(list(cond1_2.itervalues()))
+ sum21 = sum(list(cond2_1.itervalues()))
+ sum22 = sum(list(cond2_2.itervalues()))
+ # for each codon, get values and sd in each condition
+ cond1_val = {}
+ cond1 = {}
+ cond2_val = {}
+ cond2 = {}
+ std_cond1 = []
+ std_cond2 = []
+ max_val = [] # # max value for graph
+ for i in codon_sorted:
+ # # cond1 = mean of replicats cond1 divided by max
+ cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
+ cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
+ # # standard deviation = absolute value of difference between replicats of cond1
+ std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
+ # # cond2 = mean of replicats cond1divided by max
+ cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
+ cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
+ # # standard deviation = absolute value of difference between replicats of cond2
+ std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
+ # # max value for each codon
+ max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2))
+
+ # for graph design
+ cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0]))
+ cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items())
+ cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0]))
+ cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items())
+ max_val = [x * 100 for x in max_val]
+ except ZeroDivisionError:
+ stop_err("Not enough reads to compute the codon occupancy")
+
+ AA = get_aa_dict(cond1_norm, cond2_norm)
+ max_valaa = []
+ cond1_aa = []
+ cond2_aa = []
+ aa_name = list(AA.iterkeys())
+ for z in AA.itervalues():
+ cond1_aa.append(z[0])
+ cond2_aa.append(z[1])
+ max_valaa.append(max(z))
+ # # plot amino acid profile :
+ fig = pl.figure(figsize=(15,10), num=1)
+ width = .50
+ ax = fig.add_subplot(111)
+ ax.xaxis.set_ticks([])
+ ind = arange(21)
+ pl.xlim(0, 21)
+ ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
+ ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
+ for x, y, z in zip(ind, max_valaa, aa_name):
+ ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
+ ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
+ ax.set_xlabel('Amino Acid')
+ handles, labels = ax.get_legend_handles_labels()
+ ax.legend(handles, labels)
+ pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
+ pl.clf()
+
+
+ # # compute theorical count in COND2
+ sum2 = (sum21 + sum22) / 2
+ cond2_count = []
+ for z in cond1_norm.itervalues() :
+ count = int(z * sum2 / 100)
+ cond2_count.append(count)
+
+ expected = array(cond2_count)
+ observed = array(list(cond2.itervalues()))
+
+ # write result
+ with open(outfile, 'w') as out :
+ out.write('Codon,Raw_' + c1 + ',Raw_' + c2 + ',Norm_' + c1 + ',Norm_' + c2 + ',FC(Mut/WT)\n')
+ for i in codon_sorted:
+ ## if global foldchange is equal to zero
+ if cond1_norm[i] == 0 and cond2_norm[i] == 0:
+ out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',1.0\n')
+ elif cond1_norm[i] == 0 :
+ out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',0.0\n')
+ else:
+ out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
+ with errstate(all='ignore'):
+ chi = stats.chisquare(observed, expected)
+ out.write('Khi2 test\n')
+ out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
+
+
+
+ # plot result
+ fig = pl.figure(figsize=(20,10), num=1)
+ width = .40
+ ind = arange(len(codon_sorted))
+ ax = fig.add_subplot(111)
+ pl.xlim(0, len(codon_sorted) + 1)
+ ax.spines['right'].set_color('none')
+ ax.spines['top'].set_color('none')
+ ax.xaxis.set_ticks([])
+ ax.spines['left'].set_smart_bounds(True)
+ ax.yaxis.set_ticks_position('left')
+ ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, yerr=std_cond1, error_kw={'elinewidth':1, 'ecolor':'black'}, label=c1)
+ ax.bar(ind + width, list(cond2_norm.itervalues()), width, yerr=std_cond2, facecolor=color2, error_kw={'elinewidth':1, 'ecolor':'black'}, label=c2)
+ for x, y, z in zip(ind, max_val, codon_sorted):
+ ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8)
+ ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
+ ax.set_xlabel('Codons')
+ handles, labels = ax.get_legend_handles_labels()
+ ax.legend(handles, labels)
+ pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
+ pl.clf()
+
+
+ elif len(result) == 2 :
+
+ # store each dict in OrderedDict sorted by key to make code more readable
+ cond1 = result[0]
+ cond2 = result[1]
+ cond1_norm = result[0].copy()
+ cond2_norm = result[1].copy()
+ # pdb.set_trace()
+ # get codon order in one of list
+ codon_sorted = sorted(cond1.iterkeys(), reverse=False)
+ try:
+ # get sum of each list
+ sum1 = sum(list(cond1.itervalues()))
+ sum2 = sum(list(cond2.itervalues()))
+ # #Normalize values by sum of each libraries
+ cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
+ cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())
+ except ZeroDivisionError:
+ stop_err("Not enough reads to compute the codon occupancy. "+str(sum1)+" and "+str(sum2)+" reads are used for each condition, respectively.\n")
+
+ # # compute theorical count in COND2
+ cond2_count = []
+ for z in cond1_norm.itervalues() :
+ count = int(z * sum2 / 100.0)
+ cond2_count.append(count)
+
+ expected = array(cond2_count)
+ observed = array(list(cond2.itervalues()))
+
+ AA = get_aa_dict(cond1_norm, cond2_norm)
+
+ max_val = []
+ cond1_aa = []
+ cond2_aa = []
+ aa_name = list(AA.iterkeys())
+ for z in AA.itervalues():
+ cond1_aa.append(z[0])
+ cond2_aa.append(z[1])
+ max_val.append(max(z))
+
+ # # plot amino acid profile :
+ fig = pl.figure(figsize=(15,10), num=1)
+ width = .45
+ ax = fig.add_subplot(111)
+ ind = arange(21)
+ pl.xlim(0, 21)
+ #kwargs = {"hatch":'x'}
+ #ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1, **kwargs)
+ #kwargs = {"hatch":'.'}
+ #ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2, **kwargs)
+ ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
+ ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
+ #for x, y, z in zip(ind, max_val, aa_name):
+ # ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
+ axis_font = {'size':'10'}
+ pl.xticks(ind + width, aa_name,**axis_font)
+ ax.spines['right'].set_visible(False)
+ ax.spines['top'].set_visible(False)
+ ax.yaxis.set_ticks_position('left')
+ ax.xaxis.set_ticks_position('bottom')
+ #ax.xaxis.set_ticks([])
+ ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font)
+ ax.set_xlabel('Amino Acids', **axis_font)
+ handles, labels = ax.get_legend_handles_labels()
+ font_prop = font_manager.FontProperties(size=8)
+ ax.legend(handles, labels, prop=font_prop)
+ pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
+ pl.clf()
+
+ # write result
+ with open(outfile, 'w') as out :
+ out.write('Codon,Raw_' + c1 + ',Raw_' + c2 + ',Norm_' + c1 + ',Norm_' + c2 + ',FC(Mut/WT)\n')
+ for i in codon_sorted:
+ if cond1_norm[i] == 0 and cond2_norm[i] == 0:
+ out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',1.0\n')
+ elif cond1_norm[i] == 0 :
+ out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',0.0\n')
+ else:
+ out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
+ out.write('Khi2 test\n')
+ with errstate(all='ignore'):
+ chi = stats.chisquare(observed, expected)
+ out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
+
+ # # get max value for each codon for histogram
+ max_val = [] # # max value for graph
+ for i in cond1:
+ # # max value for each codon
+ max_val.append(max(cond1_norm[i], cond2_norm[i]))
+
+ # plot result
+ fig = pl.figure(figsize=(20,10), num=1)
+ #fig = pl.figure(num=1)
+ width = .40
+ ind = arange(len(codon_sorted))
+ ax = fig.add_subplot(111)
+ pl.xlim(0, len(codon_sorted) + 1)
+ ax.spines['right'].set_color('none')
+ ax.spines['top'].set_color('none')
+ ax.xaxis.set_ticks([])
+ ax.spines['left'].set_smart_bounds(True)
+ ax.yaxis.set_ticks_position('left')
+ ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1)
+ ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2)
+ for x, y, z in zip(ind, max_val, codon_sorted):
+ ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8)
+ ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
+ ax.set_xlabel('Codons')
+ handles, labels = ax.get_legend_handles_labels()
+ ax.legend(handles, labels)
+ pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
+ pl.clf()
+
+
+ else :
+ stop_err('Error running codon usage plotting : ' + str(e))
+
+
+ return (cond1_norm, cond2_norm, chi[1])
+
+def write_html_file(html, chi_pval, cond1, cond2):
+ try :
+
+
+ html_str = """
+
+
+
+
+
+
+
+
+
Global visualization
+
+
Visualization of density footprint in each codon.
If user has selected "Yes" for the replicate option the standard deviation between each replicate is plotted as an error bar in histogram.
+
+
+
+
Test for homogeneity distribution between each condition
+ H0 : %s and %s are same distribution
+ Khi2 test p-value: %s
+ If p-value less than 0.05, we can reject homogeneity distribution so we can hypothesize that distributions are not the same. Otherwise, we accept H0
+
+
+
+
Visualization of density footprint in each codon groupe by amino acid
+
+
+
+ """ % (cond1,cond2,chi_pval)
+
+
+ html_file = open(html, "w")
+ html_file.write(html_str)
+ html_file.close()
+
+ except Exception, e :
+ stop_err('Error during html page creation : ' + str(e))
+
+
+
+
+def check_codons_list (codons) :
+
+ for codon in codons :
+ if codon not in init_codon_dict().iterkeys() :
+ stop_err('Please to enter a valid codon : ' + codon + ' is not find\n')
+
+
+def check_index_bam (bamfile) :
+ # #testing indexed bam file
+ if os.path.isfile(bamfile + ".bai") :
+ pass
+ else :
+ cmd = "samtools index %s " % (bamfile)
+ proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE)
+ returncode = proc.wait()
+ # if returncode != 0:
+ # raise Exception
+
+def plot_fc (cond1, cond2, site, dirout):
+
+ fc = cond1.copy()
+
+ for key, value in fc.iteritems():
+ if cond1[key] == 0:
+ fc[key] = 1
+ else:
+ fc[key] = cond2[key]/cond1[key]
+
+ index = arange(len(fc.keys()))
+ label = fc.keys()
+ label = [w.replace('T','U') for w in label]
+ pl.figure(figsize=(15,10), num=1)
+ ax = pl.subplot(1,1,1)
+ pl.xticks([])
+ pl.scatter(index, fc.values(), color='b')
+ pl.axhline(y=1,color='r')
+ pl.xticks(index, label, rotation=90)
+ pl.ylabel('Foldchange of codon occupancy')
+ ax.yaxis.set_ticks_position('left')
+ ax.xaxis.set_ticks_position('bottom')
+ pl.ylim(-1,3)
+ pl.title(site+" site")
+ pl.savefig(dirout + '/fc_codons.png', format="png", dpi=340)
+
+
+def __main__():
+
+
+ # Parse command line options
+ parser = optparse.OptionParser()
+ parser.add_option("-g", "--gff", dest="gff", type="string",
+ help="gff file", metavar="FILE")
+
+ parser.add_option("-1", "--file1", dest="file1", type="string",
+ help="Bam Ribo-Seq alignments cond 1, if rep option, separate files by commas ", metavar="FILE")
+
+ parser.add_option("-2", "--file2", dest="file2", type="string",
+ help="Bam Ribo-Seq alignments cond 2, if rep option, separate files by commas", metavar="FILE")
+
+ parser.add_option("-c", "--cond1", dest="c1", type="string",
+ help="Name for first condition", metavar="STR")
+
+ parser.add_option("-C", "--cond2", dest="c2", type="string",
+ help="Name of second condition", metavar="STR")
+
+ parser.add_option("-k", "--kmer", dest="kmer", type="int", default = 28 ,
+ help="Length of your phasing reads", metavar="INT")
+
+# parser.add_option("-l", "--list", dest="list_cod", type= "string",
+# help="list of codons to compare to other", metavar="STR")
+
+ parser.add_option("-o", "--out", dest="outfile", type="string",
+ help="write report to FILE", metavar="FILE")
+
+ parser.add_option("-d", "--dirout", dest="dirout", type="string",
+ help="write report to PNG files", metavar="FILE")
+
+ parser.add_option("-a", "--asite", dest="asite", type="int", default = 15 ,
+ help="Off-set from the 5'end of the footprint to the A-site (default is 15)", metavar="INT")
+
+ parser.add_option("-s", "--site", dest="site", type="string", default = "A" ,
+ help="Script can compute in site A, P or E (default is A-site)", metavar="A|P|E")
+
+ parser.add_option("-r", "--rep", dest="rep", type="string", default = "no" ,
+ help="if replicate or not", metavar="yes|no")
+
+ parser.add_option("-x", "--hex_col1", dest="color1", type= "string", default = "SkyBlue" ,
+ help="Color for first condition", metavar="STR")
+
+ parser.add_option("-X", "--hex_col2", dest="color2", type= "string", default = "Plum" ,
+ help="Color for second condition", metavar="STR")
+
+ parser.add_option("-q", "--quiet",
+ action="store_false", dest="verbose", default=True,
+ help="don't print status messages to stdout")
+
+ (options, args) = parser.parse_args()
+ print "Begin codon frequency analysis at", time.asctime(time.localtime(time.time()))
+
+ try:
+ authorized_site = ["A", "P", "E"]
+ if options.site not in authorized_site :
+ stop_err(options.site + ' is not a authorized ribosome site')
+
+ ## Check if colors exist
+ if not colors.is_color_like(options.color1) :
+ stop_err( options.color1+' is not a proper color' )
+ if not colors.is_color_like(options.color2) :
+ stop_err( options.color2+' is not a proper color' )
+
+
+ #### NOT USE IN FINAL VERSION
+ # # get codon list
+ # codons = options.list_cod.upper().split(',')
+ # check_codons_list(codons)
+ GFF = HTSeq.GFF_Reader(options.gff)
+ # # get html file and directory :
+ (html, html_dir) = options.dirout.split(',')
+ if not os.path.exists(html_dir):
+ try:
+ os.mkdir(html_dir)
+ except Exception, e :
+ stop_err('Error running make directory : ' + str(e))
+ # #RUN analysis
+ # #If there are replicats
+ if options.rep == "yes" :
+ result = []
+ # split name of each file options by ","
+ cond1 = options.file1.split(',')
+ cond2 = options.file2.split(',')
+ # # calcul for each file
+ for fh in itertools.chain(cond1, cond2):
+ check_index_bam (fh)
+ result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite))
+ (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
+ # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
+ plot_fc (cond1, cond2, options.site, html_dir)
+
+ # #If there are no replicat
+ elif options.rep == "no" :
+ result = []
+ # #calcul for each cond
+ for fh in (options.file1, options.file2):
+ check_index_bam (fh)
+ result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite))
+ (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
+
+ # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
+ plot_fc (cond1, cond2, options.site, html_dir)
+ else :
+ sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time())))
+ sys.exit()
+
+ # write_html_file(html,chi_pval,t_pval,codons,options.c1, options.c2)
+ write_html_file(html, chi_pval, options.c1, options.c2)
+
+ print "Finish codon frequency analysis at", time.asctime(time.localtime(time.time()))
+ except Exception, e:
+ stop_err('Error running codon frequency analysis (main program) : ' + str(e))
+
+
+if __name__=="__main__":
+ __main__()
diff -r 000000000000 -r 01b945ba3697 ribo_tools/get_codon_frequency.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ribo_tools/get_codon_frequency.xml Wed Sep 27 04:59:10 2017 -0400
@@ -0,0 +1,101 @@
+
+ To compare Ribo-seq alignments between two sets of conditions, to determine codon occupancy
+
+ samtools
+ matplotlib
+ numpy
+ scipy
+ Bio
+
+
+ get_codon_frequency.py
+ --gff=$annot
+ --rep=$replicat_opt['rep']
+ --cond1 $cond1
+ --cond2 $cond2
+ --kmer=$kmer
+ --asite=$asite
+ --out $out
+ --site $site
+ --hex_col1 $color1
+ --hex_col2 $color2
+ --dirout $html_file,$html_file.files_path
+ #if str( $replicat_opt['rep'] ) == 'yes':
+ --file1=$file1,$file11
+ --file2=$file2,$file22
+ #else
+ --file1=$file1
+ --file2=$file2
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+ ## Use conditional balise : if rep =yes : 4 files, else 2 files
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Summary
+-------
+This tool uses Ribo-seq (BAM file) to determine whether codon occupancy differs between two sets of conditions. For each footprint, the codons at the chosen site are recorded and a histogram displaying all the normalised codon numbers is plotted for both sets of conditions. A second histogram groups together all the codons corresponding to a given amino acid. A chi-squared test is then carried out to determine whether the distribution of the codons used is the same in both sets of conditions.
+
+
+Output
+-------
+This tool provides an html file containing graphs and a statistical test result. An additional csv file with codon numbers is provided.
+
+
+Dependances
+------------
+
+.. class:: warningmark
+
+This tool depends on Python (>=2.7) and following packages : numpy 1.8.0, Biopython 1.58, scipy 0.12.0 and matplotlib 1.3.1. Samtools is used for bam manipulation.
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 01b945ba3697 ribo_tools/kmer_analysis.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ribo_tools/kmer_analysis.py Wed Sep 27 04:59:10 2017 -0400
@@ -0,0 +1,464 @@
+#!/usr/bin/env python2.7.3
+#-*- coding: utf-8 -*-
+'''
+ Created on Jun. 2014
+ @author: rachel legendre
+ @copyright: rachel.legendre@igmors.u-psud.fr
+ @license: GPL v3
+'''
+
+import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands
+import pysam
+#from matplotlib import pyplot as pl
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as pl
+from numpy import arange
+from collections import OrderedDict
+import ribo_functions
+#import cPickle
+## suppress matplotlib warnings
+import warnings
+warnings.filterwarnings('ignore')
+
+
+total_mapped_read = 0
+
+
+def stop_err( msg ):
+ sys.stderr.write( "%s\n" % msg )
+ sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
+ sys.exit()
+
+
+def split_bam(bamfile,tmpdir):
+ '''
+ split bam by chromosome and write sam file in tmpdir
+ '''
+ try:
+ #get header
+ results = subprocess.check_output(['samtools', 'view', '-H',bamfile])
+ header = results.split('\n')
+
+ #define genome size
+ genome = []
+ for line in header:
+ result = re.search('SN', line)
+ if result :
+ #print line
+ feat = line.split('\t')
+ chrom = re.split(":", feat[1])
+ #print feat[1]
+ genome.append(chrom[1])
+
+ #split sam by chrom
+ n = 0
+ for chrm in genome:
+ with open(tmpdir+'/'+chrm+'.sam', 'w') as f :
+ #write header correctly for each chromosome
+ f.write(header[0]+'\n')
+ expr = re.compile(chrm+'\t')
+ el =[elem for elem in header if expr.search(elem)][0]
+ f.write(el+'\n')
+ f.write(header[-2]+'\n')
+ #write all reads for each chromosome
+ reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
+ f.write(reads)
+ # calculate number of reads
+ n += reads.count(chrm)
+
+ sys.stdout.write("%d reads are presents in your bam file\n" % n)
+
+ except Exception, e:
+ stop_err( 'Error during bam file splitting : ' + str( e ) )
+
+
+
+def get_first_base(tmpdir, kmer):
+ '''
+ write footprint coverage file for each sam file in tmpdir and get kmer distribution
+ '''
+ global total_mapped_read
+
+ ## tags by default
+ multi_tag = "XS:i:"
+ tag = "IH:i:1"
+
+ ## init kmer dict
+ KMER = OrderedDict({})
+
+ try:
+ file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
+ ##write coverage for each sam file in tmpdir
+ for samfile in file_array:
+ with open(tmpdir+'/'+samfile, 'r') as sam :
+ ##get chromosome name
+ chrom = samfile.split(".sam")[0]
+
+ for line in sam:
+ #initialize dictionnary
+ if '@SQ\tSN:' in line :
+ size = int(line.split('LN:')[1])
+ genomeF = [0]*size
+ genomeR = [0]*size
+ # define multiple reads keys from mapper
+ elif '@PG\tID' in line :
+ if 'bowtie' in line:
+ multi_tag = "XS:i:"
+ elif 'bwa' in line:
+ multi_tag = "XT:A:R"
+ elif 'TopHat' in line:
+ tag = "NH:i:1"
+ else :
+ stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
+
+ # get footprint
+ elif re.search('^[^@].+', line) :
+ len_read = len(line.split('\t')[9])
+ ##full kmer dict
+ if KMER.has_key(len_read):
+ KMER[len_read] += 1
+ else :
+ KMER[len_read] = 1
+
+ #print line.rstrip()
+ read_pos = int(line.split('\t')[3])
+ read_sens = int(line.split('\t')[1])
+ #len_read = len(line.split('\t')[9])
+ if len_read == kmer and (tag in line or multi_tag not in line):
+ ###if line.split('\t')[5] == '28M' :
+ total_mapped_read +=1
+ #if it's a forward read
+ if read_sens == 0 :
+ #get 5' base
+ genomeF[read_pos] += 1
+ #if it's a reverse read
+ elif read_sens == 16 :
+ #get 5' base
+ read_pos += (len_read-1)
+ genomeR[read_pos] += 1
+
+ #try:
+ #write coverage in files
+ with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov :
+ for i in range(0,size):
+ cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
+ #except Exception,e:
+ # stop_err( 'Error during coverage file writting : ' + str( e ) )
+
+ #sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read)
+
+ return KMER
+
+ except Exception, e:
+ stop_err( 'Error during footprinting : ' + str( e ) )
+
+
+def __percent__(prop):
+
+ if sum(prop) != 0 :
+ perc = [0,0,0]
+ if prop[0] != 0 :
+ perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
+ if prop[1] != 0 :
+ perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
+ if prop[2] != 0 :
+ perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
+ return perc
+ else :
+ return prop
+
+
+def frame_analysis(tmpdir,GFF):
+ '''
+ This function take footprint and annotation (gff) for analyse reading frame in each gene
+ '''
+ global total_mapped_read
+ try:
+ chrom = "" # initializing chromosome
+ nb_gene = 0 # number of analysed genes
+ whole_phasing = [0,0,0]
+ for gene in GFF['order']:
+ ## maybe no start position in GTF file so we must to check and replace
+ exon_number = GFF[gene]['exon_number']
+ try : GFF[gene]['start']
+ except :
+ if GFF[gene]['strand'] == '+' :
+ GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
+ else :
+ GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
+ ## also for stop coordinates
+ try : GFF[gene]['stop']
+ except :
+ if GFF[gene]['strand'] == '+' :
+ GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
+
+ else :
+ GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
+ cov = []
+ ##first chromosome : we open corresponding file
+ try:
+ if chrom == "" :
+ chrom = GFF[gene]['chrom']
+ with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+ data = f.readlines()
+ ##if we change chromosome
+ elif chrom != GFF[gene]['chrom'] :
+ chrom = GFF[gene]['chrom']
+ with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+ data = f.readlines()
+ except IOError :
+ print tmpdir+"/assoCov_"+chrom+".txt doesn't exist"
+
+ try:
+ ## if a gene without intron :
+ if GFF[gene]['exon_number'] == 1:
+
+ ## get coverage for each gene
+ if GFF[gene]['strand'] == "+":
+ for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+ cov.append(int((data[i].rstrip()).split("\t")[0]))
+ else :
+ for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+ cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+ cov.reverse()
+ else :
+ ## For each gene, get coverage and sum of exon size
+ if GFF[gene]['strand'] == "+":
+
+ for exon in range(1,GFF[gene]['exon_number']+1) :
+ for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+ #if i <= GFF[gene]['stop'] :
+ cov.append(int((data[i].rstrip()).split("\t")[0]))
+ else :
+
+ for exon in range(1,GFF[gene]['exon_number']+1) :
+ for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+ #if i <= GFF[gene]['start'] :
+ cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+ cov.reverse()
+ except :
+ #print gene+" could not be analysed."
+ #del GFF[gene]
+ continue
+ len_cov = len(cov)
+ prop = [0,0,0]
+ for nuc in range(0,len_cov-2,3) :
+ ## Calculate proportion
+ prop[0] += cov[nuc]
+ prop[1] += cov[nuc+1]
+ prop[2] += cov[nuc+2]
+ whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
+
+ whole_phasing = __percent__(whole_phasing)
+ #sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
+ return whole_phasing
+
+ except Exception, e:
+ stop_err( 'Error during frame analysis : ' + str( e ) )
+
+
+
+def make_report(html_file, dirout, kmer, results) :
+
+ array = sorted(kmer.items(), key=lambda x: x[0])
+ values = []
+ label = []
+ for x,y in array :
+ values.append(y)
+ label.append(x)
+ index = arange(len(label))
+ bar_width = 0.35
+ axis_font = {'size':'10'}
+ fig, ax = pl.subplots()
+ pl.bar(index ,values, color='LightsteelBlue')
+ pl.xlabel('kmer value', **axis_font)
+ pl.ylabel('Number of reads', **axis_font)
+ pl.title('Number of reads for each k-mer')
+ pl.xticks(index + bar_width, label, **axis_font)
+ #pl.show()
+ fig.subplots_adjust()
+ pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
+ pl.clf()
+
+ for key, phase in results.iteritems() :
+ fig = pl.figure(num=1)
+ frame = phase
+ index = arange(3)
+ bar_width = 0.5
+ pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue'])
+ pl.xlabel('Frame in gene', **axis_font)
+ pl.ylabel('Percent of read', **axis_font)
+ pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
+ pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
+ #pl.tight_layout()
+ pl.ylim(0,100)
+ fig.subplots_adjust()
+ pl.draw()
+ pl.show()
+ pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
+ pl.clf()
+
+ kmer_summary = ''
+ kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0]))
+ for key, number in kmer_sorted.iteritems() :
+ if number > 100 :
+ kmer_summary += '
'
+ kmer_summary += 'Analysis of '+str(key)+'-mer : '
+ kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : '
+ kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+' '
+ kmer_summary += ' '
+ kmer_summary += '
Look inside the img folder to find close.png, loading.gif, prev.png, and next.png. These files are used in lightbox.css. By default, lightbox.css will look for these images in a folder called img.
+
+
Part 2 - Turn it on
+
+
Add a data-lightbox attribute to any image link to activate Lightbox. For the value of the attribute, use a unique name for each image. For example:
+
Optional: Set the title attribute if you want to show a caption.
+
If you have a group of related images that you would like to combine into a set, use the same data-lightbox attribute value for all of the images. For example:
+
For you long time Lightbox users, don't fret, you can still enable Lightbox by using rel="lightbox". The new data-lightbox approach is preferred though as it is valid html.
+
+
+
+
+
+
+
+
Help
+
+
Bugs
+
If you find a bug, create an issue on Github. Include your operating system and browser version along with detailed steps to reproduce the bug.
+
+
Feature requests
+
If you want a feature added, please create an issue on Github. Someone else or I might be able to help out. No guarantees.
+
+
Support questions
+
If you have a question that is not a bug or a feature request, your best chance of getting an answer is by following these steps:
+
+
Search the Lightbox forums. You will not be able to post new questions in the forum as posting has been disabled. The forum is having spam problems and the forum will eventually be phased out.
a",n=d.getElementsByTagName("*")||[],r=d.getElementsByTagName("a")[0],!r||!r.style||!n.length)return t;s=a.createElement("select"),u=s.appendChild(a.createElement("option")),o=d.getElementsByTagName("input")[0],r.style.cssText="top:1px;float:left;opacity:.5",t.getSetAttribute="t"!==d.className,t.leadingWhitespace=3===d.firstChild.nodeType,t.tbody=!d.getElementsByTagName("tbody").length,t.htmlSerialize=!!d.getElementsByTagName("link").length,t.style=/top/.test(r.getAttribute("style")),t.hrefNormalized="/a"===r.getAttribute("href"),t.opacity=/^0.5/.test(r.style.opacity),t.cssFloat=!!r.style.cssFloat,t.checkOn=!!o.value,t.optSelected=u.selected,t.enctype=!!a.createElement("form").enctype,t.html5Clone="<:nav>"!==a.createElement("nav").cloneNode(!0).outerHTML,t.inlineBlockNeedsLayout=!1,t.shrinkWrapBlocks=!1,t.pixelPosition=!1,t.deleteExpando=!0,t.noCloneEvent=!0,t.reliableMarginRight=!0,t.boxSizingReliable=!0,o.checked=!0,t.noCloneChecked=o.cloneNode(!0).checked,s.disabled=!0,t.optDisabled=!u.disabled;try{delete d.test}catch(h){t.deleteExpando=!1}o=a.createElement("input"),o.setAttribute("value",""),t.input=""===o.getAttribute("value"),o.value="t",o.setAttribute("type","radio"),t.radioValue="t"===o.value,o.setAttribute("checked","t"),o.setAttribute("name","t"),l=a.createDocumentFragment(),l.appendChild(o),t.appendChecked=o.checked,t.checkClone=l.cloneNode(!0).cloneNode(!0).lastChild.checked,d.attachEvent&&(d.attachEvent("onclick",function(){t.noCloneEvent=!1}),d.cloneNode(!0).click());for(f in{submit:!0,change:!0,focusin:!0})d.setAttribute(c="on"+f,"t"),t[f+"Bubbles"]=c in e||d.attributes[c].expando===!1;d.style.backgroundClip="content-box",d.cloneNode(!0).style.backgroundClip="",t.clearCloneStyle="content-box"===d.style.backgroundClip;for(f in x(t))break;return t.ownLast="0"!==f,x(function(){var n,r,o,s="padding:0;margin:0;border:0;display:block;box-sizing:content-box;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;",l=a.getElementsByTagName("body")[0];l&&(n=a.createElement("div"),n.style.cssText="border:0;width:0;height:0;position:absolute;top:0;left:-9999px;margin-top:1px",l.appendChild(n).appendChild(d),d.innerHTML="
+ %s
+
+ """ % (gene_table)
+
+ html_file = open(html,"w")
+ html_file.write(html_str)
+ html_file.close()
+ html_index = open(subfolder+'/index.html','w')
+ html_index.write(html_str)
+ html_index.close()
+
+
+ except Exception, e :
+ stop_err('Error during html page creation : ' + str( e ) )
+
+
+
+def __main__():
+
+ #Parse command line options
+ parser = optparse.OptionParser()
+ parser.add_option("-g", "--gff", dest="gff", type= "string",
+ help="GFF annotation file", metavar="FILE")
+
+ parser.add_option("-b", "--bam", dest="bamfile", type= "string",
+ help="Bam Ribo-Seq alignments ", metavar="FILE")
+
+ parser.add_option("-f", "--fasta", dest="fasta", type= "string",
+ help="Fasta file ", metavar="FILE")
+
+ parser.add_option("-d", "--dirout", dest="dirout", type= "string",
+ help="write report in this html file and in associated directory", metavar="STR,STR")
+
+ parser.add_option("-x", "--boxplot", dest="box", type= "string",
+ help="report boxplot in this file", metavar="STR")
+
+ parser.add_option("-c", "--cutoff", dest="cutoff", type= "int", default = 60 ,
+ help="Integer value for selecting all genes that have less than 60 % (default) of reads in coding frame ", metavar="INT")
+
+ parser.add_option("-k", "--kmer", dest="kmer", type= "int",default = 28 ,
+ help="Longer of your phasing reads (Default is 28)", metavar="INT")
+
+ parser.add_option("-o", "--orf_length", dest="orf_length", type= "int", default = 300 ,
+ help="Mean of ORF's length for segmentation (Default is 300pb, recommended for yeast)", metavar="INT")
+
+ parser.add_option("-p", "--frame", dest="frame", type="int", default = 1 ,
+ help="Script can compute in frame 1, 2 or 3", metavar="1|2|3")
+
+ parser.add_option("-q", "--quiet",
+ action="store_false", dest="verbose", default=True,
+ help="don't print status messages to stdout")
+
+ (options, args) = parser.parse_args()
+ sys.stdout.write("Begin frame analysis at %s\n" % time.asctime( time.localtime(time.time())))
+ # Create temp dir
+ try:
+ tmp_dir = tempfile.mkdtemp()
+ (html_file, subfolder ) = options.dirout.split(",")
+
+ ##testing indexed bam file
+ if os.path.isfile(options.bamfile+".bai") :
+ pass
+ else :
+ cmd = "samtools index %s " % (options.bamfile)
+ proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
+ returncode = proc.wait()
+
+ if not os.path.exists(subfolder):
+ try:
+ os.mkdir(subfolder)
+ except Exception, e :
+ stop_err('Error running make directory : ' + str(e))
+
+ ## check frame arg
+ if options.frame not in [1,2,3]:
+ stop_err( 'Please enter a good value for frame argument : must be 1, 2 or 3' )
+
+ ##RUN analysis
+ split_bam(options.bamfile,tmp_dir)
+ get_first_base (tmp_dir,options.kmer,options.frame)
+ ## identify GFF or GTF format from 9th column
+ with open (options.gff,"r") as gffile :
+ for line in gffile :
+ if '#' in line :
+ ## skip header
+ gffile.next()
+ elif 'gene_id' in line :
+ ## launch gtf reader :
+ GFF = ribo_functions.store_gtf(options.gff)
+ break
+ elif 'ID=' in line :
+ ## launch gff reader
+ GFF = ribo_functions.store_gff(options.gff)
+ break
+ else :
+ stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
+
+ ## check gff reading
+ if not GFF['order'] :
+ stop_err( 'Incorrect GFF file')
+ for gene in GFF['order']:
+ if not GFF[gene]['exon'] :
+ del GFF[gene]
+ GFF['order'].remove(gene)
+
+ #### cPickles for Test ####
+ #if os.path.isfile("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict") :
+ # with open("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict",'rb') as fp:
+ # GFF = cPickle.load(fp)
+ #else :
+ # split_bam(options.bamfile,tmp_dir)
+ # get_first_base (tmp_dir)
+ # GFF = store_gff(options.gfffile)
+ # GFF = frame_analysis (tmp_dir,options.outfile,options.cutoff,GFF,subfolder)
+ # with open("/home/rlegendre/OutOfFrameWithIntron/pyt_dict",'wb') as fp:
+ # cPickle.dump(GFF,fp)
+ #
+ #### cPickles for Test ####
+
+ GFF = frame_analysis (tmp_dir,subfolder,options.cutoff,GFF, options.box, options.orf_length)
+ plot_subprofile (GFF,subfolder,options.fasta)
+ write_html_page(html_file,subfolder,GFF)
+
+ ##paste jquery script in result directory :
+ jq_src = os.path.join(os.path.dirname(__file__),'lightbox')
+ shutil.copytree(jq_src,os.path.join(subfolder,'lightbox'))
+
+
+
+ sys.stdout.write("Finish frame analysis at %s\n" % time.asctime( time.localtime(time.time())))
+ except Exception, e:
+ stop_err( 'Error running metagene analysis : ' + str( e ) )
+
+ if os.path.exists( tmp_dir ):
+ shutil.rmtree( tmp_dir )
+
+
+
+if __name__=="__main__":
+ __main__()
\ No newline at end of file
diff -r 000000000000 -r 01b945ba3697 ribo_tools/metagene_frameshift_analysis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ribo_tools/metagene_frameshift_analysis.xml Wed Sep 27 04:59:10 2017 -0400
@@ -0,0 +1,77 @@
+
+ To analyse Ribo-seq alignments for the extraction of translational ambiguities
+
+ samtools
+ matplotlib
+ numpy
+ PIL
+ Bio
+
+
+ metagene_frameshift_analysis.py --gff $reference --bam $mapping --cutoff $cutoff --kmer $kmer --fasta $fasta --dirout $output,$output.files_path --box $boxplot --orf_length $orf --frame $frame > $log
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Summary
+-------
+This tool uses Ribo-seq data (BAM file) to extract out-of-frame footprints in all genes from a reference annotation file (`GFF3`_). Subprofiles are plotted for each gene with dual coding events.
+
+
+*- GFF3 file*: This file must have nine tabulated-delimited columns: Chromosome, source, feature, start, stop, score, strand, phasing, note. The gene ID is retrieved from the note field, using the "ID=" tag.
+
+*- Fasta file*: Reference fasta file. Care should be taken with the chromosome nomenclature used, which must be compatible with the GFF3 annotation file.
+
+*- BAM file*: This file should be sorted and may contain either multiple or unaligned footprints
+
+*- Kmer*: Length of the best-phased footprints. It can be calculated by running kmer_analysis
+
+*- Frame*: Frame for which the phasing of the footprints is best. It can be calculated by running kmer_analysis
+
+*- Cutoff*: An integer value for selecting all genes for which fewer than 60 % (default) of the footprints are in the coding frame.
+
+*- Orf*: Approximate size of the segment.
+
+
+.. _GFF3: http://gmod.org/wiki/GFF3
+
+
+Output
+-------
+This tool generates 3 output files:
+
+*- html file*: for the detection and visualisation of translational ambiguities.
+
+*- Stat file*: this file provides statistics for the treated footprints and phasing.
+
+*- Boxplot*: Proportion of footprints in the three frames, for all genes.
+
+
+Dependances
+------------
+
+.. class:: warningmark
+
+This tool depends on Python (>=2.7) and following packages : numpy 1.8.0, Biopython 1.58, matplotlib 1.3.1. Samtools is used for bam manipulation.
+
+
+
+
+
diff -r 000000000000 -r 01b945ba3697 ribo_tools/metagene_readthrough.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ribo_tools/metagene_readthrough.py Wed Sep 27 04:59:10 2017 -0400
@@ -0,0 +1,1056 @@
+#!/usr/bin/env python2.7.3
+#-*- coding: utf-8 -*-
+
+'''
+ Created on Dec. 2013
+ @author: rachel legendre
+ @copyright: rachel.legendre@igmors.u-psud.fr
+ @license: GPL v3
+'''
+
+import os, sys, time, optparse, shutil, re, urllib, subprocess, tempfile
+from urllib import unquote
+from Bio import SeqIO
+import csv
+import pysam
+import HTSeq
+#from matplotlib import pyplot as pl
+import matplotlib
+from jinja2.nodes import Pos
+matplotlib.use('Agg')
+import matplotlib.pyplot as pl
+from numpy import arange
+from matplotlib import ticker as t
+from PIL import Image
+import ribo_functions
+## suppress matplotlib warnings
+import warnings
+
+
+
+def stop_err( msg ):
+ sys.stderr.write( "%s\n" % msg )
+ sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
+ sys.exit()
+
+
+def compute_rpkm(length,count_gene,count_tot) :
+
+ try :
+ rpkm = "{0:.4f}".format(count_gene*1000000.0/(count_tot*length))
+ except ArithmeticError :
+ stop_err( 'Illegal division by zero during computing RPKM')
+ return float(rpkm)
+
+
+def find_stop(seq) :
+ '''
+ Find stop codon in a sequence and return position of first nucleotide in stop
+ '''
+ stop_codon = ['TAA','TAG','TGA']
+ for nt in range(0,len(seq)-3,3) :
+ codon = seq[nt:nt+3]
+ if codon in stop_codon :
+ return nt
+
+ return -1
+
+
+def check_met(seq):
+ '''
+ Boolean function for testing presence or absence of methionine in 5 codons following stop codon
+ '''
+ met = 'ATG'
+ for pos in range(0,15,3) :
+ codon = seq[pos:pos+3]
+ if codon in met :
+ return True
+
+ return False
+
+
+'''
+ feature.iv is a GenomicInterval object :
+ A GenomicInterval object has the following slots, some of which
+ are calculated from the other:
+
+ chrom: The name of a sequence (i.e., chromosome, contig, or
+ the like).
+ start: The start of the interval. Even on the reverse strand,
+ this is always the smaller of the two values 'start' and 'end'.
+ Note that all positions should be given as 0-based value!
+ end: The end of the interval. Following Python convention for
+ ranges, this in one more than the coordinate of the last base
+ that is considered part of the sequence.
+ strand: The strand, as a single character, '+' or '-'. '.' indicates
+ that the strand is irrelevant. (Alternatively, pass a Strand object.)
+ length: The length of the interval, i.e., end - start
+ start_d: The "directional start" position. This is the position of the
+ first base of the interval, taking the strand into account. Hence,
+ this is the same as 'start' except when strand == '-', in which
+ case it is end-1.
+ end_d: The "directional end": Usually, the same as 'end', but for
+ strand=='-1', it is start-1.
+
+'''
+def check_overlapping(gff_reader,chrm,start,stop,strand, name):
+
+ #### probleme avec les genes completement inclu...
+
+ iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand)
+ for feature in gff_reader:
+ if feature.type == "gene" :
+ if feature.iv.overlaps(iv2) and feature.attr.get('gene_name') != name :
+ ## if its a reverse gene, we replace start of extension by start of previous gene
+ if strand == '-' :
+ return (feature.iv.end+3,stop)
+ ## else we replace stop of extension by start of following gene
+ else :
+ return (start,feature.iv.start-3)
+ ## if no overlap are find, we return -1
+ return (start,stop)
+
+
+def pass_length(start,stop) :
+
+ if (stop-start) > 25 :
+ return True
+ else :
+ return False
+
+
+def check_homo_coverage(gene,GFF,start,stop, aln) :
+
+ chrom = GFF[gene]['chrom']
+ ## compute number of nucleotides in CDS with a coverage equal to zero
+ nt_cds_cov = 0
+ nt_cds_num = 0
+ for i in range(1,GFF[gene]['exon_number']+1) :
+ for z in range(GFF[gene]['exon'][i]['start'],GFF[gene]['exon'][i]['stop']):
+ nt_cds_num += 1
+ if aln.count(chrom,z,z+1) == 0 :
+ nt_cds_cov += 1
+
+ ## compute percent of CDS no covering
+ if nt_cds_cov == 0 or nt_cds_num == 0:
+ percent = 0
+ else:
+ percent = nt_cds_cov*100/nt_cds_num
+
+ ## compute number of nucleotides with no coverage in extension
+ nt_no_cover = 0
+ for pos in range(start,stop,1) :
+ if aln.count(chrom,pos,pos+1) == 0 :
+ nt_no_cover += 1
+ #print gene, nt_cds_cov, percent, nt_no_cover
+ #percent10 = (stop-start)*50/100
+ if (nt_no_cover*100)/(stop-start) > percent :
+ return False
+ else :
+ return True
+
+def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) :
+
+
+ ### ignore matplotlib warnings for galaxy
+ warnings.filterwarnings('ignore')
+ try:
+ strand = gene['strand']
+ len_gene = gene['stop']-gene['start']
+ if strand is "-" :
+ len_ext = gene['stop']-start_extension
+ ## coverage in all gene + extension
+ start = start_extension-100
+ stop = gene['stop']+100
+ vector1 = [0]*(stop-start)
+ #for pileupcolumn in aln.pileup( gene['chrom'], start, stop):
+ # vector.append(pileupcolumn.n)
+ for read in aln.fetch(gene['chrom'], start, stop):
+ if read.is_reverse :
+ ## for get footprint in P-site (estimate) we take 3 nt in middle of read
+ #pos = (read.pos+13)-start
+ pos = (read.pos-start) + (read.rlen/2)-1
+ for z in range(pos,(pos+3)) :
+ ## le fetch contient des reads qui chevauchent les 30 nt de la fin du gene mais dont le site P
+ ## se trouve avant notre vector, le z devient negatif et la couverture augmente en fin de vecteur (ce qui est faux)
+ if z > 0 :
+ try :
+ vector1[z] += 1
+ except IndexError :
+ pass
+ vector1.reverse()
+ mean_read = float(sum(vector1))/float(len(vector1))
+ cov = [(x/mean_read) for x in vector1]
+ idx_tot = arange(len(cov))
+ ## coverage in extension
+ start_ext = start_extension-40
+ stop_ext = stop_extension+30
+ vector2 = [0]*(stop_ext-start_ext)
+ #for pileupcolumn in aln.pileup( gene['chrom'], start, stop):
+ # vector.append(pileupcolumn.n)
+ for read in aln.fetch(gene['chrom'], start_extension, stop_ext):
+ if read.is_reverse :
+ ## get footprint in P-site
+ #pos = (read.pos+13)-start_ext
+ pos = (read.pos-start_ext) + (read.rlen/2)-1
+ for z in range(pos,(pos+3)) :
+ if z > 0 :
+ try :
+ vector2[z] += 1
+ except IndexError :
+ pass
+ vector2.reverse()
+ #mean_read = float(sum(vector))/float(len(vector))
+ cov_ext = [(x/mean_read) for x in vector2]
+ _max = max(cov_ext[30::])
+ idx_ext = arange(len(cov_ext))
+
+ else :
+ len_ext = stop_extension-gene['start']
+ start = gene['start']-100
+ stop = stop_extension+100
+ vector = [0]*(stop-start)
+ #for pileupcolumn in aln.pileup( gene['chrom'], start, stop):
+ #vector.append(pileupcolumn.n)
+ for read in aln.fetch(gene['chrom'], start, stop):
+ if not read.is_reverse :
+ ## get footprint in P-site
+ #pos = (read.pos+12)-start
+ pos = (read.pos-start) + (read.rlen/2)-1
+ for z in range(pos,(pos+3)) :
+ if z > 0 :
+ try :
+ vector[z] += 1
+ except IndexError :
+ pass
+ mean_read = float(sum(vector))/float(len(vector))
+ cov = [(x/mean_read) for x in vector]
+ idx_tot = arange(len(cov))
+
+ ## coverage in extension
+ start_ext = gene['stop']-30
+ stop_ext = stop_extension+40
+ vector = [0]*(stop-start_ext)
+ for read in aln.fetch(gene['chrom'], start_ext, stop_extension):
+ if not read.is_reverse :
+ ## get footprint in P-site
+ pos = (read.pos-start_ext) + (read.rlen/2)-1
+ for z in range(pos,(pos+3)) :
+ if z > 0 :
+ try :
+ vector[z] += 1
+ except IndexError :
+ pass
+
+ cov_ext = [(x/mean_read) for x in vector]
+ idx_ext = arange(len(cov_ext))
+ _max = max(cov_ext[30::])
+
+ #### PLOT FIGURE ####
+
+ font = {'family' : 'serif','color': 'grey','weight' : 'normal','size' : 16 }
+
+
+ fig = pl.figure(num=1)
+ ## create a big subplot for common y axis on two last subplot
+ ax = fig.add_subplot(2,1,2)
+ ## hide all spines
+ ax.spines['top'].set_visible(False)
+ ax.spines['bottom'].set_visible(False)
+ ax.spines['left'].set_visible(False)
+ ax.spines['right'].set_visible(False)
+ ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off')
+
+ ## plot gene structure
+ ax1 = fig.add_subplot(3,1,1)
+ ax1.set_title(gene['name'])
+ ## hide all spines
+ ax1.spines['right'].set_color('none')
+ ax1.spines['top'].set_color('none')
+ ax1.spines['left'].set_color('none')
+ ax1.spines['bottom'].set_color('none')
+ ax1.set_ylim(0,5)
+ #set xlim as second plot with 100nt switch
+ ax1.set_xlim(-100,len(idx_tot)-100)
+ ## hide ticks
+ pl.yticks([])
+ pl.xticks([])
+ ## if it's a "simple" gene
+ if gene['exon_number'] == 1 :
+ exon = arange(len_gene)
+ x = [3]*len_gene
+ ax1.plot(exon,x,color='#9B9E9C')
+ ax1.fill_between(exon,2,x,color='#9B9E9C')
+ ## if it's a gene with introns
+ else :
+ ## plot a line representing intron
+ intron = arange(len_gene)
+ y = [2.5]*len_gene
+ ax1.plot(intron,y,color='#9B9E9C')
+
+ ## plotting each intron
+ start = gene['start']
+ if strand == '+' :
+ for exon in range(1,gene['exon_number']+1,1) :
+ len_exon = gene['exon'][exon]['stop']-gene['exon'][exon]['start']
+ idx = gene['exon'][exon]['start'] - start
+ exo = arange(idx,idx+len_exon)
+ x = [3]*len_exon
+ ax1.plot(exo,x,color='#9B9E9C')
+ ax1.fill_between(exo,2,x,color='#9B9E9C')
+ else :
+ ## if it's a reverse gene we must reverse exon's position
+ start = gene['start']
+ tabF = [2.5]*len_gene
+ tabR = [2.5]*len_gene
+ for exon in range(1,gene['exon_number']+1,1) :
+ len_exon = gene['exon'][exon]['stop']-gene['exon'][exon]['start']
+ idx = gene['exon'][exon]['start'] - start
+ exo = arange(idx,idx+len_exon)
+ for z in exo :
+ tabF[z] = 3
+ tabR[z] = 2
+ tabF.reverse()
+ tabR.reverse()
+ #pl.ylim(0,5)
+ ax1.plot(tabF,color='#9B9E9C')
+ ax1.plot(tabR,color='#9B9E9C')
+ x = arange(len(tabR))
+ ax1.fill_between(x,tabF,tabR,color='#9B9E9C')
+
+ ## insert arrows (as genome browser representation)
+ yloc = 2.5
+ narrows = 20
+ exonwidth = .8
+ spread = .4 * len_gene / narrows
+ for i in range(narrows):
+ loc = (float(i) * len_gene / narrows)+ (len(idx_tot)/100)*2
+ if strand == '+' :
+ x = [loc - spread, loc, loc - spread]
+ else:
+ x = [loc - spread, loc, loc - spread]
+ y = [yloc - exonwidth / 5, yloc, yloc + exonwidth / 5]
+ ax1.plot(x, y, lw=1.4, color='#676B69')
+
+
+ # plot coverage in all gene + extension
+ ax2 = fig.add_subplot(3,1,2)
+ ## fixe limit to length of coverage for x axis
+ ax2.set_xlim(0,len(idx_tot))
+ ## fixe 4 ticks and associated labels for y axis
+ ax2.set_yticklabels(arange(0,int(max(cov)+20),int((max(cov)+20)/4)))
+ ax2.set_yticks(arange(0,int(max(cov)+20),int((max(cov)+20)/4)))
+ ### add start and stop of gene in axe in place of number
+ ax2.set_xticks([100,(100+len_gene),(100+len_ext)])
+ ax2.set_xticklabels(["{:,}".format(gene['start']),"{:,}".format(gene['stop']),""])
+ ## hide spines and any axis
+ ax2.spines['right'].set_visible(False)
+ ax2.spines['top'].set_visible(False)
+ ax2.yaxis.set_ticks_position('left')
+ ax2.xaxis.set_ticks_position('bottom')
+ ## plot and fill
+ ax2.plot(idx_tot, cov, color="#CC0011")
+ ax2.fill_between(idx_tot, 0,cov,color="#CC0011")
+ ax2.set_title("Genomic coordinates (%s,%s)"% (gene['strand'],gene['chrom']) ,fontdict={'fontsize':'small'})
+
+ # plot zoom coverage in extension
+ ax3 = fig.add_subplot(3,1,3)
+ ## fixe limit to length of coverage for x axis
+ ax3.set_ylim(0,int(_max))
+ # we hide spines
+ ax3.spines['right'].set_visible(False)
+ ax3.spines['top'].set_visible(False)
+ ax3.yaxis.set_ticks_position('left')
+ ax3.xaxis.set_ticks_position('bottom')
+ ## add stop and in-frame stop in x axis
+ pl.xticks([30,( stop_extension-start_extension)+30],["stop codon","next in-frame stop"])
+ ## fixe good position for labels
+ (ax3.get_xticklabels()[0]).set_horizontalalignment('center')
+ (ax3.get_xticklabels()[1]).set_horizontalalignment('left')
+ #if max coverage is lower than 2, we have a illegal division by zero
+ if _max > 2 :
+ ax3.set_yticklabels(arange(0,int(_max+1),int((_max+1)/3)))
+ ax3.set_yticks(arange(0,int(_max+1),int((_max+1)/3)))
+ else :
+ ##
+ ax3.set_ylim(0,_max)
+ ax3.ticklabel_format(style='sci', scilimits=(-2,2), axis='y',useOffset=False)
+
+ ax3.plot(idx_ext, cov_ext, color="#CC0011")
+ ax3.fill_between(idx_ext, 0,cov_ext,color="#CC0011")
+
+
+ ## get scale of subplot 3
+ #ax3.text(ax3.get_xlim()[1]-50,ax3.get_ylim()[1]-1, r'50 nt', fontdict=font)
+ #pl.arrow( ax3.get_xlim()[1]-50, ax3.get_ylim()[1]-2, ax3.get_xlim()[1]-50, 0, fc="grey", ec="grey",lw=2)
+
+ ## set common y label
+ ax.set_ylabel('Normalized footprint counts',labelpad=20)
+
+ ## draw and save plot
+ pl.draw()
+ #pl.show()
+ pl.savefig(dirout+".png",format='png', dpi=300)
+ pl.clf()
+
+
+ ## Make thumbnail for html page
+ infile = dirout+'.png'
+ size = 128, 128
+ im = Image.open(infile)
+ im.thumbnail(size, Image.ANTIALIAS)
+ im.save(dirout+"_thumbnail.png", "PNG")
+ warnings.resetwarnings()
+
+ except Exception, e:
+ stop_err( 'Error during gene plotting : ' + str( e ) )
+
+
+def __percent__(prop):
+
+ if sum(prop) != 0 :
+ perc = [0,0,0]
+ if prop[0] != 0 :
+ perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
+ if prop[1] != 0 :
+ perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
+ if prop[2] != 0 :
+ perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
+ return perc
+ else :
+ return prop
+
+def compute_phasing(chrom, start, stop, aln, kmer, strand):
+
+ phasing = [0,0,0]
+ for track in aln.fetch(chrom, start, stop):
+ if strand == '-':
+ if track.is_reverse :
+ if len(track.seq) == kmer :
+ pos = stop - (track.pos + track.rlen - 12)
+ if pos > 0:
+ phasing[pos%3] += 1
+ else :
+ if not track.is_reverse :
+ if len(track.seq) == kmer :
+ pos = (track.pos + 12) - start
+ if pos > 0:
+ phasing[pos%3] += 1
+
+ #return __percent__(phasing)
+ return phasing
+
+
+def compute_analysis(bam, GFF, fasta, gff, dirout, extend, kmer) :
+
+ try:
+ #background_file = dirout+"/background_sequence.fasta"
+ #file_back = open(background_file, 'w')
+ #file_context = open(dirout+"/stop_context.fasta", 'w')
+ #file_extension = open(dirout+"/extensions.fasta", 'w')
+ ## Opening Bam file with pysam librarie
+ pysam.index(bam)
+ aln = pysam.Samfile(bam, "rb",header=True, check_header = True)
+ ## Opening fasta file in a dict with BioPython
+ SeqDict = SeqIO.to_dict(SeqIO.parse(open(fasta,"r"),"fasta"))
+
+ ## count total read in bam file
+ cmd = "samtools view -c %s " % (bam)
+ proc = subprocess.Popen( args=cmd, shell=True, stdout = subprocess.PIPE)
+ count_tot = int(proc.stdout.read().rstrip())
+ returncode = proc.wait()
+
+ ## opening a GFF reader for check overlapping
+ gff_reader = HTSeq.GFF_Reader(gff)
+
+ with open(dirout+"/readthrough_result.csv","w") as out :
+ myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence']
+ writer = csv.writer(out,delimiter=',')
+ writer.writerow(myheader)
+ for gene in GFF['order'] :
+ #print GFF[gene]
+ ## maybe no start position in GTF file so we must to check and replace
+ exon_number = GFF[gene]['exon_number']
+ try : GFF[gene]['start']
+ except :
+ if GFF[gene]['strand'] == '+' :
+ GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
+ else :
+ GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
+ ## also for stop coordinates
+ try : GFF[gene]['stop']
+ except :
+ if GFF[gene]['strand'] == '+' :
+ GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
+
+ else :
+ GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
+
+
+ indic = ""
+ # compute rpkm of CDS :
+ len_cds = GFF[gene]['stop']-GFF[gene]['start']
+ count_cds = 0
+ rpkm_cds = 0
+ count_cds = 0
+ try :
+ ### count method of pysam doesn't strand information
+ if GFF[gene]['strand'] == '+' :
+ for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) :
+ if not track.is_reverse :
+ count_cds += 1
+ else :
+ for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) :
+ if track.is_reverse :
+ count_cds += 1
+ rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot)
+ except ValueError:
+ ## genere warning about gtf coordinates
+ #warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" )
+ pass
+ ## Only if gene is translate :
+ if rpkm_cds > 0 and count_cds > 128:
+ ## search footprint in UTR3
+ count = 0
+ try :
+ if GFF[gene]['strand'] == '+' :
+ contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6])
+ #print gene, contexte_stop
+ start_extension = GFF[gene]['stop']
+ stop_extension = GFF[gene]['stop']+90
+ for track in aln.fetch(GFF[gene]['chrom'],start_extension+15,stop_extension) :
+ if not track.is_reverse :
+ count += 1
+
+ ## get sequence of extension
+ seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension])
+
+ else :
+ contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement())
+ #print gene, contexte_stop
+ start_extension = GFF[gene]['start']-90
+ stop_extension = GFF[gene]['start']
+ for track in aln.fetch(GFF[gene]['chrom'],start_extension,stop_extension-15) :
+ if track.is_reverse :
+ count += 1
+ ## get sequence of extension
+ seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement())
+
+
+ ## if we have coverage after cds stop codon
+ if count > 10 :
+ res = find_stop(seq)
+ if res == -1 :
+ '''
+ Write results with no stop but RPF in extension
+ '''
+ ## check if next in-frame codon is far than 90 nt extension :
+ if GFF[gene]['strand'] == '+' :
+ pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+extend,'+',GFF[gene]['name'])
+ start_extension = pos[0]-1
+ stop_extension = pos[1]
+ #start_extension = GFF[gene]['stop']
+ #stop_extension = GFF[gene]['stop']+extend
+ seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension])
+
+ #print gene,count,pos,'\n',seq
+
+ if (seq):
+ res = find_stop(seq)
+ if res == -1 :
+ mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
+ writer.writerow(mylist)
+ else :
+ indic = 'ok'
+ #print res
+ #stop_extension = start_extension + res +3
+ else :
+ pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-extend,GFF[gene]['start']-1,'-',GFF[gene]['name'])
+ start_extension = pos[0]
+ stop_extension = pos[1]+1
+ #start_extension = GFF[gene]['start']-extend
+ #stop_extension = GFF[gene]['start']
+ seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement())
+
+ #print gene,count,pos,'\n',seq
+
+ if (seq):
+ res = find_stop(seq)
+ if res == -1 :
+ mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
+ writer.writerow(mylist)
+ else :
+ indic = 'ok'
+ #print res
+ #start_extension = stop_extension - res -3
+ else :
+ indic = 'ok'
+
+
+ if indic == 'ok' :
+ ## We save new coordinates
+ if GFF[gene]['strand'] == '+' :
+ stop_extension = start_extension + res +3
+ #print gene, count
+ #print gene,start_extension,stop_extension
+ seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension])
+ #print seq
+ count_stop = aln.count(GFF[gene]['chrom'],stop_extension-2,stop_extension+2)
+ if pass_length(start_extension,stop_extension) :
+ count_ext = aln.count(GFF[gene]['chrom'],start_extension+9,stop_extension-15)
+ if stop_extension > GFF[gene]['stop']+9 :
+ stop_ok = 1
+ else :
+ stop_ok = 0
+ else :
+ start_extension = stop_extension - res - 3
+ #print gene, count
+ #print gene,start_extension,stop_extension
+ seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension-1:stop_extension-1].reverse_complement())
+ #print seq
+ count_stop = aln.count(GFF[gene]['chrom'],start_extension-2,start_extension+2)
+ if pass_length(start_extension,stop_extension) :
+ count_ext = aln.count(GFF[gene]['chrom'],start_extension+15,stop_extension-9)
+ if start_extension < GFF[gene]['start']-9 :
+ stop_ok = 1
+ else :
+ stop_ok = 0
+ ## if we are no methionine in 5 codons following stop of CDS
+ if (not check_met(seq) ):
+ ## if we have footprint in stop codon extension and stop extension is further than cds_stop+9
+ if count_stop > 2 and stop_ok == 1 :
+ homo_cov = check_homo_coverage(gene,GFF,start_extension,stop_extension,aln)
+ # phasing = compute_phasing(GFF[gene]['chrom'],start_extension,stop_extension, aln, kmer,GFF[gene]['strand'])
+ # print gene, phasing
+# count_after_stop = 0
+# try :
+# ### count method of pysam doesn't strand information
+# if GFF[gene]['strand'] == '+' :
+# for track in aln.fetch(GFF[gene]['chrom'],stop_extension+15,stop_extension+40) :
+# if not track.is_reverse :
+# count_after_stop += 1
+# else :
+# for track in aln.fetch(GFF[gene]['chrom'],start_extension-40,start_extension-15) :
+# if track.is_reverse :
+# count_after_stop += 1
+# except ValueError:
+# ## genere warning about gtf coordinates
+# warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" )
+# pass
+# print gene+"\t"+str(count_stop)+"\t"+str(count_after_stop)
+# if (count_stop*5)/100 > count_after_stop :
+# print "moins de 5%...\n"
+ if (homo_cov) :
+ '''
+ write result witch corresponding to readthrough
+ '''
+ ##if length of extension upper than 25 we can compute rpkm
+ if (not pass_length(start_extension,stop_extension)) :
+ len_ext = stop_extension-start_extension
+ rpkm_ext = 'nan'
+ ratio = 'nan'
+ else :
+ len_ext = stop_extension-start_extension
+ rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot)
+ ## compute ratio between extension coverage and cds coverage (rpkm)
+ ratio = rpkm_ext/rpkm_cds
+ #print gene,ratio
+ #print start_extension,stop_extension
+ mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,seq,GFF[gene]['note']]
+ writer.writerow(mylist)
+ #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
+ #file_extension.write('>'+gene+'\n'+seq+'\n')
+ else :
+ '''
+ write result witch corresponding to readthrough but with no homogeneous coverage
+ '''
+ if (not pass_length(start_extension,stop_extension)) :
+ len_ext = stop_extension-start_extension
+ rpkm_ext = 'nan'
+ ratio = 'nan'
+ else :
+ len_ext = stop_extension-start_extension
+ rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot)
+ ## compute ratio between extension coverage and cds coverage (rpkm)
+ ratio = rpkm_ext/rpkm_cds
+ mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,seq,GFF[gene]['note']]
+ writer.writerow(mylist)
+ #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
+ #file_extension.write('>'+gene+'\n'+seq+'\n')
+ #print ">"+gene+"\n"+contexte_stop
+
+ ## plot gene :
+ plot_gene(aln, GFF[gene], start_extension, stop_extension, dirout+"/"+gene)
+
+
+
+ else :
+ '''
+ write result with no footprint in stop codon of extension
+ '''
+ mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
+ writer.writerow(mylist)
+ #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
+ #file_extension.write('>'+gene+'\n'+seq+'\n')
+ #print ">"+gene+"\n"+contexte_stop
+ else :
+ '''
+ write result with RPF maybe result of reinitiation on a start codon
+ '''
+ if pass_length(start_extension,stop_extension) :
+ mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
+ writer.writerow(mylist)
+ #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
+ #file_extension.write('>'+gene+'\n'+seq+'\n')
+ #print ">"+gene+"\n"+contexte_stop
+# else :
+# ## if its not a interesting case, we get stop context of genes without readthrough
+# if GFF[gene]['strand'] == '+' :
+# contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6])
+# file_back.write(contexte_stop+'\n')
+# else :
+# contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement())
+# file_back.write(contexte_stop+'\n')
+#
+ ## excluded UT with incorrect positions
+ except ValueError:
+ pass
+
+
+ #file_context.close()
+ #file_back.close()
+ #file_extension.close()
+ except Exception,e:
+ stop_err( 'Error during computing analysis : ' + str( e ) )
+
+
+
+def write_html_page(html,subfolder) :
+
+
+ try :
+
+ gene_table = ''
+ gene_table += '
'
+ gene_table += '
Gene
Plot
Name
Stop context
Coordinates
RPKM CDS
RPKM extension
ratio
Extension
'
+
+ with open(os.path.join(subfolder,'readthrough_result.csv'), 'rb') as csvfile:
+ spamreader = csv.reader(csvfile, delimiter=',')
+ ## skip the header
+ next(spamreader, None)
+ ##test if file is empty or not
+ if next(spamreader, None):
+ for row in spamreader:
+ if row[2] == '-' :
+ gene_table += '