Source code for sa.plotting

"""
##########################
plotting (``sa.plotting``)
##########################

This script contains various plotting functions for strains analysis, such as histograms, PCA projection, MDS,
Silhouette plot.
    
"""
import numpy as np
import scipy as sp
import random

from sklearn.metrics.pairwise import euclidean_distances
from matplotlib import pyplot as plt
from matplotlib import cm
from matplotlib import mlab 
from matplotlib import patches
from mpl_toolkits.mplot3d import Axes3D, proj3d

[docs]def plot_hist_coll(colls, colls_labels, out_dir): """ Plot and save to directory :param:`out_dir` overlayed histograms of distances between collections :param:`colls`. :param colls_labels: Names of collections as ordered and specified by parameter :param:`colls`. :type colls_labels: `list` """ fig = plt.figure(figsize = (12, 8)) ax1 = fig.add_subplot(111) dist = [np.tril(euclidean_distances(colls[0], colls[0]), -1)] + [euclidean_distances(colls[0], coll) for coll in colls[1:]] dist = [dist[0].ravel()[np.flatnonzero(dist[0])]] + [ds.ravel() for ds in dist[1:]] means = [] for lab, ds in zip(colls_labels, dist): means.append(np.mean(ds)) print "Mean observation distance %s in total: %5.6f" % (colls_labels[0] + "-" + lab, means[-1]) clr = ["b", "y", "r", "c", "m", "g", "k", "w"] plt.hist(dist, 50, weights = [len(dist[0])/float(len(ds)) * np.ones_like(ds) for ds in dist], histtype='bar', label = ["%s-%s, mean = %5.3f" % (colls_labels[0], lab, ms) for ms, lab in zip(means, colls_labels)], color = clr[:len(dist)]) ax1.set_xlabel("Distance") plt.legend() plt.savefig(out_dir + "distance_hist_%s.pdf" % "-".join(colls_labels)) plt.close(fig)
[docs]def plot_plate_by_mean_well_distance(dist_matrix, title, out_name, plate): """ Produce matrix plot of wells as placed on the plate. The entries in matrix are mean observation distances in param:`dist_matrix` from other observations on plate. Save to file named :param:`out_name`. White entries in the plot represent missing strains or removed WT strains due to outlier detection. :param title: Plate identifier used as part of title in the plot. :type title: `str` Return matrix of mean well distances. """ N = dist_matrix.shape[0] nrow = max([float(plate[i][3]) for i in xrange(N)]) ncol = max([float(plate[i][4]) for i in xrange(N)]) mw_matrix = np.zeros((nrow, ncol)) proc = np.zeros((nrow, ncol)) for i, el in enumerate(plate): r = int(el[3]) - 1 c = int(el[4]) - 1 proc[r,c] = 1 mw_matrix[r,c] = np.mean(dist_matrix[i, [j for j in xrange(N) if j != i]]) fig = plt.figure(figsize = (10, 8)) ax1 = fig.add_subplot(111) imax = ax1.imshow(mw_matrix, interpolation = "nearest") for r in xrange(proc.shape[0]): for c in xrange(proc.shape[1]): if proc[r,c] == 1: continue rect1 = patches.Rectangle((-0.5 + c, -0.5 + r), 1, 1, color='white') ax1.add_patch(rect1) ax1.set_title("Plate %s" % title) ax1.yaxis.tick_left() ax1.xaxis.tick_bottom() #the smallest values are lighter and the larger values are darker shades of grey fig.colorbar(imax) fname = out_name + "_mean_well.pdf" plt.savefig(fname) plt.close(fig) print "Mean observation distances as located in plate saved to file: %s" % fname return mw_matrix
[docs]def plot_hist_with_norm_fit(X, attr, title, out_name): """ Plot a histogram of of input data and plot the analytic normal PDF over it. Save it to a file :param:`out_name`. :param title: Plate identifier used as part of title in the plot. :type title: `str` """ mu = np.mean(X) sigma = np.std(X) fig = plt.figure() ax = fig.add_subplot(111) n, bins, patches = ax.hist(X, 50, normed = True, facecolor = "green", alpha = 0.75) bincenters = 0.5*(bins[1:] + bins[:-1]) y = mlab.normpdf(bincenters, mu, sigma) ax.plot(bincenters, y, 'r--', linewidth = 1) ax.set_xlabel(attr) ax.set_title("Attribute histogram for strain %s" % title) ax.grid(True) fname = out_name + "_plot_hist_with_norm_fit.pdf" plt.savefig(fname) plt.close(fig) print "Histogram of attribute data %s with analytic normal PDF over saved to: %s" % (attr, fname)
[docs]def plot_pca_projection(X, plate, title, out_name): """ Plot 3D projection along PCA components with the most variance and save it to a file :param:`out_name`. :param X: Transformed data set of computational features (dimensionality reduction). :type X: `numpy.array` :param plate: Plate data as returned from :mod:`utilities.read`. :type plate: `list` """ fig = plt.figure(figsize=(10, 8)) ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134) ax.scatter(X[:, 0], X[:, 1], X[:, 2], cmap=cm.spectral) ax.set_title("3D PCA projection for %s" % title) dist_matrix = euclidean_distances(X, X) print "Mean Euclidean distance after transforming data by PCA: %5.3f" % np.mean(dist_matrix) T = np.mean(dist_matrix)/20. for i in xrange(X.shape[0]): if np.min(dist_matrix[i,[j for j in xrange(X.shape[0]) if j != i]]) < T: continue x2, y2, _ = proj3d.proj_transform(X[i, 0], X[i, 1], X[i, 2], ax.get_proj()) label = "%s,%s" % (plate[i][3], plate[i][4]) ax.annotate(label, xy=(x2, y2), xytext=((-1 if random.random() > 0.5 else 1)*20, 20*random.random()), textcoords='offset points', ha='center', va='bottom', fontsize=8, bbox=dict(boxstyle='round,pad=0.2', fc='yellow', alpha=0.3), arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.5', color='red')) fname = out_name + "_pca_3d.pdf" plt.savefig(fname) plt.close(fig) print "3D PCA projection saved to file: %s" % fname
[docs]def plot_silhouette(scores, c_idx, K, out_name): """ Save a silhouette plot to file named :param:`out_name`, showing the distributions of silhouette scores :param:`scores` in clusters. Scores represent the silhouette scores for each observation. :param scores: Silhouette score for each observation. :type scores: `list` :param c_idx: Cluster index for each observation. :type c_idx: `list` :param K: The number of clusters. :type K: `int` """ order = np.lexsort((-scores, c_idx)) indices = [np.flatnonzero(c_idx[order] == k) for k in xrange(K)] ytick = [(np.max(ind) + np.min(ind))/2. for ind in indices] ytick_labels = ["%d" % x for x in xrange(K)] cmap = cm.jet(np.linspace(0, 1, K)).tolist() clr = [cmap[i] for i in c_idx[order]] fig = plt.figure() ax1 = fig.add_subplot(111) ax1.barh(range(len(scores)), scores[order], height = 1.0, edgecolor = "none", color = clr) ax1.set_ylim(ax1.get_ylim()[::-1]) plt.yticks(ytick, ytick_labels) plt.xlabel("Silhouette value") plt.ylabel("Cluster") fname = out_name + "_%d_silhouette.pdf" % K plt.savefig(fname) plt.close(fig) print "Silhouette saved to file: %s" % fname
[docs]def plot_hist_mean_MT_WT_distance(m_mt_wt, out_dir): """ Plot histogram of mean distances of mutant strains to WT strains and save the histogram to directory :param:`out_dir`. :param m_mt_wt: Mean distance of each mutant strain to WT strains averaging distances if replicates exist. :type m_mt_wt: `list` """ mean_mt_wt = np.mean(m_mt_wt) fig = plt.figure() ax = fig.add_subplot(111) ax.hist(m_mt_wt, 50, facecolor = "green", alpha = 0.75, histtype = "bar", label = "Mean MT-WT [std = %5.3f, min = %5.3f, max = %5.3f]" % (sp.std(m_mt_wt), np.min(m_mt_wt), np.max(m_mt_wt))) ax.plot(mean_mt_wt, 0, "ko") ax.annotate("mean = %5.3f" % mean_mt_wt, xy=(mean_mt_wt, 0), xytext=(60,50), textcoords='offset points', ha='center', va='bottom', bbox=dict(boxstyle='round,pad=0.2', fc='yellow', alpha=0.3), arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=-0.5', color='red')) ax.set_xlabel("Distance") ax.set_title("Histogram of mean distances of mutant strains to WT strains") ax.grid(True) plt.legend(prop = {"size":8}) fname = out_dir + "hist_mean_MT-WT_distances.pdf" print "Saving histogram of mean mutant distances to WT strains to file %s" % fname plt.savefig(fname) plt.close(fig)
[docs]def plot_hist_WT_WT_distance(wt_wt, out_dir): """ Plot histogram of distances between WT strains and save the histogram to directory :param:`out_dir`. :param wt_wt: Distances between all pairs of WT strains. :type wt_wt: `list` """ mean_wt_wt = np.mean(wt_wt) fig = plt.figure() ax = fig.add_subplot(111) ax.hist(wt_wt, 50, facecolor = "green", alpha = 0.75, histtype = "bar", label = "[std = %5.3f, min = %5.3f, max = %5.3f]" % (sp.std(wt_wt), np.min(wt_wt), np.max(wt_wt))) ax.plot(mean_wt_wt, 0, "ko") ax.annotate("mean = %5.3f" % mean_wt_wt, xy=(mean_wt_wt, 0), xytext=(60,50), textcoords='offset points', ha='center', va='bottom', bbox=dict(boxstyle='round,pad=0.2', fc='yellow', alpha=0.3), arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=-0.5', color='red')) ax.set_xlabel("Distance") ax.set_title("Histogram of distances between WT strains") ax.grid(True) plt.legend(prop = {"size":8}) fname = out_dir + "hist_WT-WT_distances.pdf" print "Saving histogram of distances between WT strains to file %s" % fname plt.savefig(fname) plt.close(fig)
[docs]def plot_hist_mean_MT_WT__WT_WT_distance(m_mt_wt, m_wt_wt, out_dir): """ Plot overlayed (i) histogram of mean distances of WT strains to other WT strains and (ii) and histogram of mean mutant distances to WT strains. Save the plot to directory :param:`out_dir` named `hist_WT-WT_MT-MT_distances.pdf`. :param m_mt_wt: Mean distances of MT strains to WT strains. :type m_mt_wt: `list` :param m_wt_wt: Mean distances between WT strain and other WT strains. :type m_wt_wt: `list` """ fig = plt.figure(figsize = (12, 8)) ax1 = fig.add_subplot(111) w0 = np.ones_like(m_mt_wt)/float(len(m_mt_wt))*len(m_wt_wt) w1 = np.ones_like(m_wt_wt) plt.hist([m_mt_wt, m_wt_wt], 50, weights = [w0, w1], histtype='bar', label = ["Mean MT-WT [mean = %5.3f, std = %5.3f, min = %5.3f, max = %5.3f]" % (np.mean(m_mt_wt), sp.std(m_mt_wt), np.min(m_mt_wt), np.max(m_mt_wt)), "Mean WT-WT [mean = %5.3f, std = %5.3f, min = %5.3f, max = %5.3f]" % (np.mean(m_wt_wt), sp.std(m_wt_wt), np.min(m_wt_wt), np.max(m_wt_wt))], color = ["b", "r"]) ax1.set_xlabel("Distance") ax1.grid(True) plt.legend(prop={"size":8}) plt.savefig(out_dir + "hist_MT-WT_WT-WT_distances.pdf") plt.close(fig)
[docs]def plot_hist_signif_MT_WT(mut, plates_mt, dnp_mtc, tot_mean, out_dir): """ Compute and plot observed mutant distance and mean mutant distance to sampled random set of mutant strains. Save histogram showing null distribution of distances and observed distance for each mutant to directory :param:`out_dir`. Compare observed mutant strain distance to WT strains and mutant distance to sampled sets of mutant strains: #. Construct N (100) random sets of size S (100) which contain mutant strains. #. Compute mean distance between mutant strain and each of constructed sets. #. Plot histogram of distances from previous step and observed mean distance. #. Repeat for each mutant strain. Mean distances are taken for replicates. :param mut: Observed distances of mutants and some meta data in the format [ (orf, mean_dist, [(pn1, date1, r1, c1, dist1), (pn2, date2, r2, c2, dist1), ...]) ] :type mut: `list` :param plates_mt: Complete profiles of mutant strains :type plates_mt: `list` :param dnp_mtc: Computational profiles of mutant strains. :type dnp_mtc: `numpy.array` :param tot_mean: Total mean distance between MT and WT strains. :type tot_mean: `float` :param out_dir: Full path to output directory where plot will be saved. :type out_dir: `str` """ N = 100 S = 100 plates_idx = {(el[-1], el[1], el[3], el[4]): i for i, el in enumerate(plates_mt)} for mt in mut: orf = mt[0] orf_dist = mt[1] print "Estimating null distribution and plotting for mutant %s" % orf null_distr = [] for _ in xrange(N): idx_arr_mt = np.arange(len(dnp_mtc)) np.random.shuffle(idx_arr_mt) idxs = [(orf, el[0], el[2], el[3]) for el in mt[2]] profiles = [dnp_mtc[plates_idx[idx], 1:] for idx in idxs] Y = dnp_mtc[idx_arr_mt[:S], 1:] ds = euclidean_distances(np.vstack(profiles), Y) null_distr.append(np.mean(np.mean(ds), axis = 0)) fig = plt.figure(figsize = (10,8)) ax = fig.add_subplot(111) ax.hist(null_distr, 5, facecolor = "green", alpha = 0.75, histtype = "bar", label = "Distance of mutant %s to %d random sets (S=%d) of mutants." % (orf, N, S)) ax.plot([tot_mean]*2, [0, 30], linestyle = "--", c = "black", linewidth = 2, label = "Total mean distance between MT and WT strains") ax.plot(orf_dist, 1, "ko") ax.annotate("%s, %5.3f" % (orf, orf_dist) , xy=(orf_dist, 1), xytext=(-20,20), textcoords='offset points', ha='center', va='bottom', bbox=dict(boxstyle='round,pad=0.2', fc='yellow', alpha=0.3), arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.5', color='red')) ax.set_xlabel("Distance") ax.set_title("Mutant %s" % orf) ax.grid(True) plt.xlim(0, max(orf_dist+5, np.max(null_distr)+5)) plt.legend(prop = {"size":8}) fname = out_dir + "hist_signif_%s_%d_%d.pdf" % (orf, N, S) plt.savefig(fname) plt.close(fig)