Mercurial > repos > jay > pdaug_peptide_ngrams

import matplotlib
matplotlib.use('Agg')
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

import quantiprot
from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aa2volume, get_aa2hydropathy
from quantiprot.metrics.basic import average

from matplotlib import pyplot as plt


from math import log10, floor
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import fisher_exact
from quantiprot.utils.sequence import SequenceSet, compact


def _count_frame(data, frame_range, num_bins):
    """
    Count instances in a 2D frame

    The function discretizes the feature space into a grid of cells.
    Then it counts the number of instances that fall into each cell.
    An efficient method for counting instances is used. It performs parallel
    logical comparisons of data instances to vectors that hold information on
    grid lines.

    Args:
        data (numpy.matrix): a Nx2 data matrix
        frame_range (numpy.matrix): a 2x2 matrix which defines feature ranges
        num_bins (list): a pair defining the resolution of the 2D grid
    Returns:
        cell_counts (numpy.matrix): a matrix holding counts of instances in
            each grid cell
        bin_ranges (tuple): a pair of numpy matrices holding information on
            bin(grid_cell) ranges
    """
    grid_x = np.linspace(start=frame_range[0, 0], stop=frame_range[1, 0],\
                          num=num_bins[0]+1, endpoint=True)
    grid_y = np.linspace(start=frame_range[0, 1], stop=frame_range[1, 1],\
                          num=num_bins[1]+1, endpoint=True)
    # copy because we add ones in the next lines
    bin_ranges = (np.copy(grid_x), np.copy(grid_y))


    #Count points in each grid cell
    grid_x[-1] += 1 # the last cell has to contain data at the border
    grid_y[-1] += 1 # the last cell has to contain data at the border

    gte_x = np.matrix(data[:, 0] >= grid_x, dtype='float64')
    lt_x = np.matrix(data[:, 0] < grid_x, dtype='float64')
    gte_y = np.matrix(data[:, 1] >= grid_y, dtype='float64')
    lt_y = np.matrix(data[:, 1] < grid_y, dtype='float64')

    dif_x = gte_x - lt_x
    dif_y = gte_y - lt_y

    bins_x = dif_x.argmin(axis=1) - 1
    bins_y = dif_y.argmin(axis=1) - 1

    coords = np.concatenate((bins_x, bins_y), axis=1)

    cell_counts = np.zeros(shape=(len(grid_x)-1, len(grid_y)-1))

    for i in range(coords.shape[0]):
        cell_counts[coords[i, 0], coords[i, 1]] += 1

    return cell_counts, bin_ranges


def local_fisher_2d(set1, set2, features=None, \
                    windows_per_frame=10, overlap_factor=1, frame_range=None):
    """
    Compare local and global distribution of samples from two populations
    in the 2d feature space using the Fisher's exact test.

    The function performs the Fisher Exact Test for comparing local and global
    ratia of instance counts from two different populations. It uses the
    '_count_frame' function to discretize the feature space and get instance
    counts. Then it scans the 2d feature space with a sliding window and
    performs the Fisher Exact test.

        Args:
            set1 (SequenceSet or numpy.matrix): the first set with at least
                2 sequence features.
            set2 (SequenceSet or numpy.matrix): the second set with at least
                2 sequence features.
            features (tuple or list): strings with feature names for running
                the 2d Fisher test. If None then the first two features are
                used. Relevant only if 'set1' or 'set2' are SequenceSets.
            windows_per_frame (int): ratio between the whole feature space and
                the sliding window (default 10).
            overlap_factor (int):ratio between the size of a sliding window
                and a discretization grid cell (default 1).
            frame_range(numpy.matrix): 2x2 matrix with range of features
                in both dimensions.

        Returns final_res (dict): a dictionary including:
            'odds_ratio' (numpy.matrix): a matrix of odds_ratios obtained
                in each sliding window position.
            'p_value' (numpy.matrix): a matrix containing Fisher test outcome
                pvalues in each sliding window position.
            'w_counts1' (numpy.matrix): a matrix with first population instance
                counts in each sliding window position.
            'w_counts2' (numpy.matrix): a matrix with second population instance
                counts in each sliding window position.
            'w_center_x' (numpy.matrix): matrix containing coordinates of window
                centers in the X dimension.
            'w_center_y' (numpy.matrix): matrix containing coordinates of window
                centers in the Y dimension.
            '_bin_ranges_x' (numpy.matrix): matrix containing bin(grid_cell)
                ranges in the X dimension.
            '_bin_ranges_y' (numpy.matrix): matrix containing bin(grid_cell)
                ranges in the Y dimension.
    """

    if isinstance(set1, SequenceSet):
        mat1 = np.transpose(np.matrix(compact(set1,
                                              features=features).columns()))
    if isinstance(set2, SequenceSet):
        mat2 = np.transpose(np.matrix(compact(set2,
                                              features=features).columns()))

    #Deal with window_per_frame and overlap_factor
    #given either as a scalar or as a list-like
    if not hasattr(windows_per_frame, "__len__"):
        w_per_frame = (windows_per_frame, windows_per_frame)
    else:
        w_per_frame = (windows_per_frame[0], windows_per_frame[1])

    if not hasattr(overlap_factor, "__len__"):
        w_size = (overlap_factor, overlap_factor)
    else:
        w_size = (overlap_factor[0], overlap_factor[1])

    num_bins = (w_per_frame[0]*w_size[0], w_per_frame[1]*w_size[1])

    if frame_range is None:
        #Evaluate the range of features in both populations.

        frame_range = np.concatenate((np.minimum(mat1.min(0), mat2.min(0)),\
                                      np.maximum(mat1.max(0), mat2.max(0))))

        margin_x = (frame_range[1, 0] - frame_range[0, 0])/w_per_frame[0]
        margin_y = (frame_range[1, 1] - frame_range[0, 1])/w_per_frame[1]

        frame_range[0, 0] -= margin_x
        frame_range[1, 0] += margin_x

        frame_range[0, 1] -= margin_y
        frame_range[1, 1] += margin_y

    #Discretize feature space into NxM grid,
    #where N = w_per_frame[0]*w_size[0].
    #      M = w_per_frame[1]*w_size[1].
    #count instances of population1 and population2 in each grid cell.
    #both bin ranges are always the same because the frame range is common.
    cell_counts1, bin_ranges = _count_frame(mat1, frame_range=frame_range,\
                                          num_bins=num_bins)
    cell_counts2, _ = _count_frame(mat2, frame_range=frame_range,\
                                          num_bins=num_bins)

    #Number of windows that fit in a single row/column of a frame
    w_number = (cell_counts1.shape[0]-w_size[0]+1,
                cell_counts1.shape[1]-w_size[1]+1)

    #Initialize matrices holding counts at scanning window positions.
    window_counts1 = np.zeros(shape=w_number)
    window_counts2 = np.zeros(shape=w_number)

    #Initialize matrices holding window coordinates
    window_center_x = np.zeros(shape=w_number[0])
    window_center_y = np.zeros(shape=w_number[1])

    #Initialize matrices holding Fisher Exact test results
    fisher_pv = np.ones(shape=w_number)
    odds_ratio = np.ones(shape=w_number)

    #Calculate population totals in the whole feature space
    all1 = cell_counts1.sum()
    all2 = cell_counts2.sum()

    #Calculate window centers
    for start_x in range(0, w_number[0]):
        window_center_x[start_x] = (bin_ranges[0][start_x]+ \
                                    bin_ranges[0][start_x+w_size[0]])/2
    for start_y in range(0, w_number[1]):
        window_center_y[start_y] = (bin_ranges[1][start_y]+ \
                                    bin_ranges[1][start_y+w_size[1]])/2

    #Scan the feature space with a step of 1 cell.
    for start_x in range(0, w_number[0]):

        for start_y in range(0, w_number[1]):
            #Count instances of each population in the window
            window_counts1[start_x, start_y] = \
                cell_counts1[start_x:(start_x+w_size[0]), \
                             start_y:(start_y+w_size[1])].sum()
            window_counts2[start_x, start_y] = \
                cell_counts2[start_x:(start_x+w_size[0]), \
                             start_y:(start_y+w_size[1])].sum()
            #Perform the Fisher Exact Test against
            #h0: population ratio in the window the same as in the whole space.
            odds_ratio[start_x, start_y], fisher_pv[start_x, start_y] =\
                fisher_exact([[all1, window_counts1[start_x, start_y]],\
                              [all2, window_counts2[start_x, start_y]]])

    fisher_res = {'p_value':fisher_pv, 'odds_ratio':odds_ratio,\
                'w_counts1':window_counts1, 'w_counts2':window_counts2,\
                'w_center_x':window_center_x, 'w_center_y':window_center_y,\
                '_bin_ranges_x':bin_ranges[0], '_bin_ranges_y':bin_ranges[1]}

    return fisher_res


def _plot_local_fisher_2d(fisher_res, xlabel="feat_1", ylabel="feat_2",
                          pop1_label="pop_1", pop2_label="pop_2", out_file_path=None, fig_width=8, fig_hight=8, fig_hspace=0.35, fig_wspace=0.25):
    """
    Plot results of the local Fisher's extact test in the 2d space.

    Args:
        fisher_res (dict): output from 'fisher_local_2d'.
        xlabel (str): name of the 1st feature to appear in the plots
            (default: "feat_1")
        ylabel (str): name of the 2nd feature to appear in the plots
            (default: "feat_2")
        pop1_label (str): name of the 1st population to appear in the plots
            (default: "pop_1")
        pop2_label (str): name of the 2nd population to appear in the plots
            (default: "pop_2")
    """
    fisher_or = fisher_res["odds_ratio"]
    fisher_c1 = fisher_res["w_counts1"]
    fisher_c2 = fisher_res["w_counts2"]
    fisher_pv = fisher_res["p_value"]

    for pos_x in range(len(fisher_or)):
        for pos_y in range(len(fisher_or[0])):
            if fisher_c1[pos_x][pos_y] == 0 and fisher_c2[pos_x][pos_y] == 0:
                fisher_or[pos_x][pos_y] = np.nan
            elif fisher_c1[pos_x][pos_y] == 0:
                fisher_or[pos_x][pos_y] = np.inf
            elif fisher_c2[pos_x][pos_y] == 0:
                fisher_or[pos_x][pos_y] = -np.inf
            elif fisher_or[pos_x][pos_y] < 1:
                fisher_or[pos_x][pos_y] = -1.0/fisher_or[pos_x][pos_y]

    vmax_abs = np.nanmax(np.abs([x for x in np.array(fisher_or).flatten()
                                 if x > -np.inf and x < np.inf]))

    for pos_x in range(len(fisher_or)):
        for pos_y in range(len(fisher_or[0])):
            if abs(fisher_or[pos_x][pos_y]) == np.inf:
                fisher_or[pos_x][pos_y] = np.sign(fisher_or[pos_x][pos_y])*vmax_abs

    ##### Extra Fig perimeters added ################################
    plt.figure(figsize=(fig_width, fig_hight)) # Figure size
    plt.subplots_adjust(hspace = fig_hspace, wspace = fig_wspace) # space between the subplots.
    ##################################################################

    plt.subplot(221)
    plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"],
                   np.ma.masked_invalid(fisher_c1).T, cmap="Reds")
    plt.colorbar()
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title("Counts "+pop1_label)

    plt.subplot(222)
    plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"],
                   np.ma.masked_invalid(fisher_c2).T, cmap="Reds")
    plt.colorbar()
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title("Counts "+pop2_label)

    cmap = plt.get_cmap('RdBu')
    cmap.set_bad(color='k', alpha=1.)

    cbar_lo = 1.0/vmax_abs
    cbar_lo_places = max(0, -floor(log10(cbar_lo))+1)
    cbar_hi = vmax_abs
    cbar_hi_places = max(0, -floor(log10(cbar_hi))+1)

    plt.subplot(223)
    plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"],
                   np.ma.masked_invalid(fisher_or).T, cmap=cmap,
                   vmin=-vmax_abs, vmax=vmax_abs)
    cbar = plt.colorbar(ticks=([-vmax_abs, 0, vmax_abs]))
    cbar.ax.set_yticklabels(['< '+str(round(cbar_lo, int(cbar_lo_places))), '1',
                             '> '+str(round(cbar_hi, int(cbar_hi_places)))])
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title("Odds ratio")

    plt.subplot(224)
    plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"],
                   np.log10(np.ma.masked_invalid(fisher_pv)).T, cmap="RdGy")
    plt.colorbar()
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title("Fisher test\np-value (logarithm of 10)")

    #Savefig function added with preserving default behavior

    if out_file_path==None:
        plt.show()
    else:
        plt.savefig(out_file_path,dpi=300)


def HTML_Gen(html):

    out_html = open(html,'w')
    part_1 =  """

    <!DOCTYPE html>
    <html lang="en">
    <head>
      <title>Bootstrap Example</title>
      <meta charset="utf-8">
      <meta name="viewport" content="width=device-width, initial-scale=1">
      <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/css/bootstrap.min.css">
      <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
      <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/js/bootstrap.min.js"></script>
    <body>
    <style>
    div.container_1 {
      width:600px;
      margin: auto;
     padding-right: 10;
    }
    div.table {
      width:600px;
      margin: auto;
     padding-right: 10;
    }
    </style>
    </head>
    <div class="jumbotron text-center">
      <h1> Fisher's Plot </h1>
    </div>
    <div class="container">
      <div class="row">
        <div class="col-sm-4">
          <img src="1.png" alt="Smiley face" height="800" width="800">
        </div>

      </div>
    </div>
    </body>
    </html>
    """
    out_html.write(part_1)
    out_html.close()
# Load sets of amyloidogenic and non-amyloidogenic peptides:

def run(Fasta1, Fasta2, windows_per_frame, overlap_factor, xlabel, ylabel, pop1_label, pop2_label, htmlOutDir, htmlFname, Workdirpath):

    if not os.path.exists(htmlOutDir):
        os.makedirs(htmlOutDir)

    amyload_pos_seq = load_fasta_file(Fasta1)
    amyload_neg_seq = load_fasta_file(Fasta2)

    # Calculate quantitive features: volume and hydropathy
    mean_volume = Feature(get_aa2volume()).then(average)
    mean_hydropathy = Feature(get_aa2hydropathy()).then(average)

    fs = FeatureSet("volume'n'hydropathy")
    fs.add(mean_volume)
    fs.add(mean_hydropathy)

    amyload_pos_conv_seq = fs(amyload_pos_seq)
    amyload_neg_conv_seq = fs(amyload_neg_seq)

    # Do local Fisher:
    result = local_fisher_2d(amyload_pos_conv_seq, amyload_neg_conv_seq,
                             windows_per_frame=int(windows_per_frame), overlap_factor=int(overlap_factor))

    # Plot local Fisher:
    _plot_local_fisher_2d(result, xlabel=xlabel,
                                  ylabel=ylabel,
                                  pop1_label=pop1_label,
                                  pop2_label=pop2_label,
                                  out_file_path =os.path.join(os.getcwd(), "out.png")
                                  )


    #   plt.savefig(os.path.join(Workdirpath, htmlOutDir, "1.png"))

    HTML_Gen(os.path.join(Workdirpath, htmlOutDir, htmlFname))

if __name__=="__main__":


    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument("-f1", "--Fasta1", required=True, default=None, help="First fasta file ")
    parser.add_argument("-f2", "--Fasta2", required=True, default=None, help="Second fasta file")
    parser.add_argument("-o", "--overlap_factor", required=False, default=5, help="Overlap factor")
    parser.add_argument("-w", "--windows_per_frame", required=False, default=5, help="Windows per frame")
    parser.add_argument("-x", "--xlabel", required=True, default=None, help="X label")
    parser.add_argument("-y", "--ylabel", required=True, default=None, help="Y label")
    parser.add_argument("-p1", "--pop1_label", required=True, default=None, help="First population label")
    parser.add_argument("-p2", "--pop2_label", required=True, default=None, help="Second population label")
    parser.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(),'report_dir'),  help="Path to html dir")
    parser.add_argument("--htmlFname",  required=False, help="html output file", default="report.html")
    parser.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Path to output Working Directory")
    args = parser.parse_args()

    run(args.Fasta1, args.Fasta2, args.windows_per_frame, args.overlap_factor, args.xlabel, args.ylabel, args.pop1_label, args.pop2_label, args.htmlOutDir, args.htmlFname, args.Workdirpath)