Source code for sa.analysis

"""
##########################
analysis (``sa.analysis``)
##########################

This script performs statistical analysis of strains within a plate and between plates.

"""
import numpy as np
import sklearn
from collections import defaultdict
from operator import itemgetter
from sklearn.metrics.pairwise import euclidean_distances

import utilities as utils
import plotting
import methods

#For new wild-type ORFs, just add them to this list
WT_ORFs = ["YOR202W"]
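#Illustrative sketch (not part of the original pipeline): the routines below
#repeatedly call :func:`sa.methods.standardize` before computing distances and
#clusterings. Assuming that step amounts to the usual column-wise z-scoring of
#the numeric plate features, a minimal version could look like this.
def _example_standardize(X):
    """Return column-wise z-scores of a 2D array X (sketch only)."""
    X = np.asarray(X, dtype = float)
    std = X.std(axis = 0)
    std[std == 0] = 1.0  #avoid division by zero for constant features
    return (X - X.mean(axis = 0)) / std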
    
def strains_1p_WT(meta, plate, res_path, plot_attr_hist = True):
    """
    Analyze WT strains from one plate.

    First, filter out MT strains, i.e. retain strains with ORF equal to YOR202W. Continue by
    standardizing the features and detecting outliers using the elliptic envelope method. Detected
    outliers are saved to a file named `<plate-title>_outliers.csv`. Euclidean distances are computed
    between WT strains and plotted as a heat map. Additionally, mean distances between all strains
    (WT strains without outliers and MT strains) are plotted in a heat map as located on the plate.
    WT strains are clustered to reveal possible structures and assess their homogeneity. Also, the
    intersection between clusters and WT outliers is printed to the screen. PCA is computed and the
    explained variance is printed. Standardized plate data are saved in Orange and CSV format.

    :param meta: Meta data for one plate, `(file_name, attr_names)`, as returned from :mod:`utilities.read`.
    :type meta: `tuple`
    :param plate: Plate data as returned from :mod:`utilities.read`.
    :type plate: `list`
    :param res_path: Full path to the directory where results are to be saved.
    :type res_path: `str`

    .. seealso:: See also functions :func:`sa.methods.standardize`, :func:`sa.plotting.plot_hist_with_norm_fit`,
        :func:`sa.methods.detect_outliers`, :func:`sa.plotting.plot_plate_by_mean_well_distance`,
        :func:`sa.methods.k_means`, :func:`sa.methods.outlier2cluster`, :func:`sa.methods.decompose_PCA`,
        :func:`sa.utilities.to_orange` and :func:`sa.utilities.to_csv`.
    """
    file_name, attrs = meta
    out_name = res_path + file_name[:-4]
    dataf = utils.filter_attribute(attrs, plate, attr_name = "ORF", attr_values = WT_ORFs)
    dnpf = utils.data2np(dataf, skip_first = 5, skip_last = 1)
    if plot_attr_hist:
        for i in xrange(dnpf.shape[1]):
            plotting.plot_hist_with_norm_fit(dnpf[:, i], attrs[i], title = file_name,
                                             out_name = out_name + "_" + attrs[i])
    dnp = methods.standardize(dnpf)
    utils.pp_plate_info(meta, plate)

    #analysis
    en_out, r_data = methods.detect_outliers(dnp, dataf,
                                             out_name = "%s%s_outliers.csv" % (res_path, file_name[:-4]), save = True)
    r_dataf, r_dnp = r_data

    #if you have outliers in data set, normalizing data will scale normal data to a very small interval
    #dnp = methods.normalize(r_dnp)

    dist_matrix = methods.compute_distance(r_dnp, title = file_name, out_name = out_name,
                                           labels = tuple(r_dataf[i][0] for i in range(r_dnp.shape[0])))

    wt, mt = utils.split_WT_MT(meta, plate, wt_mt_name = "ORF", wt_name = WT_ORFs)
    #WT
    dnp_wt = utils.data2np(wt, skip_first = 5, skip_last = 1)
    dnp_wt = methods.standardize(dnp_wt)
    _, (plate_wtr, dnp_wtr) = methods.detect_outliers(dnp_wt, wt, out_name = None, save = False)
    #MUTANTS
    dnp_mt = utils.data2np(mt, skip_first = 5, skip_last = 1)
    dnp_mt = methods.standardize(dnp_mt)

    dnp_os = np.vstack((dnp_wtr, dnp_mt))
    plate_os = plate_wtr + mt
    dist_matrix_os = euclidean_distances(dnp_os, dnp_os)
    mw_matrix = plotting.plot_plate_by_mean_well_distance(dist_matrix_os, title = file_name,
                                                          out_name = out_name, plate = plate_os)

    pred_kmeans, score_kmeans, trans_kmeans = methods.k_means(dnp, dataf, k_range = range(2, 6),
                                                              out_name_silhouette = out_name,
                                                              out_dir_predictions = res_path,
                                                              out_name_predictions = file_name[:-4],
                                                              save_silhouette = True, save_predictions = True,
                                                              wt_name = WT_ORFs)
    methods.outlier2cluster(en_out[0], pred_kmeans)

    pca_trans = methods.decompose_PCA(r_dnp, r_dataf, n_components = 3, title = file_name, out_name = out_name)

    #save data used in analysis in Orange format
    utils.to_orange(attrs, plate_os, dnp_os, out_name = out_name, noncomp_first = 5, noncomp_last = 1)
    #save data used in analysis in CSV format
    utils.to_csv(attrs, plate_os, dnp_os, out_name = out_name, noncomp_first = 5, noncomp_last = 1)
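#Illustrative sketch: the docstring above mentions outlier detection with the
#elliptic envelope method. Assuming :func:`sa.methods.detect_outliers` wraps
#sklearn.covariance.EllipticEnvelope on the standardized features, a minimal
#version could look like this (the contamination level is hypothetical).
def _example_detect_outliers(X, contamination = 0.1):
    """Return a boolean mask marking outlying rows of X (sketch only)."""
    from sklearn.covariance import EllipticEnvelope
    envelope = EllipticEnvelope(contamination = contamination)
    envelope.fit(X)
    return envelope.predict(X) == -1  #EllipticEnvelope labels outliers with -1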
def strains_Np_WT(meta, plates, res_path):
    """
    Analyze WT strains from many plates by combining them into one set.

    :param meta: Meta data, `[(file_name1, attr_names1), (file_name2, attr_names2), ...]`, as returned from :mod:`utilities.read`.
    :type meta: `list`
    :param plates: Plates data as returned from :mod:`utilities.read`.
    :type plates: `list`
    :param res_path: Full path to the directory where results are to be saved.
    :type res_path: `str`

    .. seealso:: See also functions :func:`sa.methods.standardize`, :func:`sa.methods.detect_outliers`,
        :func:`sa.methods.k_means`, :func:`sa.methods.outlier2cluster`, :func:`sa.methods.decompose_PCA`,
        :func:`sa.utilities.to_orange` and :func:`sa.utilities.to_csv`.
    """
    print
    print "==== Joined standardization and outlier removal for all plates ===="
    print
    file_nameL, attrsL = zip(*meta)
    metaj, platej = utils.combine(meta, plates)
    file_name, attrs = metaj
    out_name = res_path + file_name
    dataf = utils.filter_attribute(attrs, platej, attr_name = "ORF", attr_values = WT_ORFs)
    dnpf = utils.data2np(dataf, skip_first = 5, skip_last = 1)
    dnp = methods.standardize(dnpf)

    #combined analysis as one plate
    en_out, r_data = methods.detect_outliers(dnp, dataf, out_name = "%scombined_outliers.csv" % res_path, save = True)
    r_dataf, r_dnp = r_data

    #if you have outliers in data set, normalizing data will scale normal data to a very small interval
    #dnp = methods.normalize(dnp)

    dist_matrix = methods.compute_distance(r_dnp, title = file_name + " (joined std., o.r.)",
                                           out_name = out_name + "__joined_std_out",
                                           labels = tuple(r_dataf[i][0] for i in range(len(r_dataf))))

    pred_kmeans, score_kmeans, trans_kmeans = methods.k_means(dnp, dataf, k_range = range(2, 6),
                                                              out_name_silhouette = out_name,
                                                              out_dir_predictions = res_path,
                                                              out_name_predictions = "combined",
                                                              save_silhouette = True, save_predictions = True,
                                                              wt_name = WT_ORFs)
    methods.outlier2cluster(en_out[0], pred_kmeans)

    pca_trans = methods.decompose_PCA(r_dnp, r_dataf, n_components = 3, title = file_name, out_name = out_name)

    #separate standardization and outlier removal for each plate
    print
    print "==== Separate standardization and outlier removal for each plate ===="
    print
    dnp_all, plate_all = [], []
    for smeta, splate in zip(meta, plates):
        wt, mt = utils.split_WT_MT(smeta, splate, wt_mt_name = "ORF", wt_name = WT_ORFs)
        #WT
        dnp_wt = utils.data2np(wt, skip_first = 5, skip_last = 1)
        dnp_wt = methods.standardize(dnp_wt)
        _, (plate_wtr, dnp_wtr) = methods.detect_outliers(dnp_wt, wt, out_name = None, save = False)
        #MUTANTS
        dnp_mt = utils.data2np(mt, skip_first = 5, skip_last = 1)
        dnp_mt = methods.standardize(dnp_mt)
        #save
        plate_all.extend(plate_wtr)
        plate_all.extend(mt)
        dnp_all.extend(dnp_wtr)
        dnp_all.extend(dnp_mt)

    dist_matrix = methods.compute_distance(np.array(dnp_all), title = file_name + " (per plate std., o.r.)",
                                           out_name = out_name + "__separate_std_out",
                                           labels = tuple(r_dataf[i][0] for i in range(len(r_dataf))))

    #save data used in analysis in Orange format
    utils.to_orange(attrs, plate_all, np.array(dnp_all), out_name = out_name, noncomp_first = 5, noncomp_last = 1)
    #save data used in analysis in CSV format
    utils.to_csv(attrs, plate_all, np.array(dnp_all), out_name = out_name, noncomp_first = 5, noncomp_last = 1)
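#Illustrative sketch: :func:`sa.methods.k_means` is called with k_range =
#range(2, 6) and a silhouette output name, which suggests model selection by
#silhouette score. A minimal version of that recipe with sklearn (hypothetical
#parameters, not the project's implementation) could look like this.
def _example_k_means_silhouette(X, k_range = range(2, 6)):
    """Return (best_k, labels) that maximize the silhouette score (sketch only)."""
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    best = None
    for k in k_range:
        labels = KMeans(n_clusters = k, random_state = 0).fit_predict(X)
        score = silhouette_score(X, labels)
        if best is None or score > best[0]:
            best = (score, k, labels)
    return best[1], best[2]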
def strains_Np_MT(data_del, data_ts, data_sg, repeats_path, res_path, repeats_keys = ["RT", "37"], standardize = True):
    """
    Find mutants with significantly different profiles than wild-type cells by estimating distances
    between WT strains, between MT strains, and between WT and MT strains, and assessing significance
    with a permutation test.

    :param data_del: Deletion collection plates data as returned from :mod:`utilities.read`.
    :type data_del: `tuple` (meta_data, plates_data)
    :param data_ts: TS collection plates data as returned from :mod:`utilities.read`.
    :type data_ts: `tuple` (meta_data, plates_data)
    :param data_sg: SG collection plates data as returned from :mod:`utilities.read`.
    :type data_sg: `tuple` (meta_data, plates_data)
    :param repeats_path: Full path to file with multi-occurring mutant strains specification.
    :type repeats_path: `str`
    :param res_path: Full path to the directory where results are to be saved.
    :type res_path: `str`
    :param repeats_keys: Names of TS (temperature sensitive mutant strains) plates' extensions. By default these are ["RT", "37"].
    :type repeats_keys: `list`
    :param standardize: Indicator whether to work with standardized or original features. By default, the data set is standardized.
    :type standardize: `bool`

    .. seealso:: See also functions :func:`sa.plotting.plot_hist_mean_MT_WT_distance`, :func:`sa.plotting.plot_hist_WT_WT_distance`,
        :func:`sa.plotting.plot_hist_mean_MT_WT__WT_WT_distance` and :func:`sa.plotting.plot_hist_signif_MT_WT`.
    """
    plates_wt, dnp_wtc = [], []
    plates_mt, dnp_mtc = [], []
    for coll in [data_del, data_ts, data_sg]:
        for meta, plate in zip(coll[0], coll[1]):
            wt, mt = utils.split_WT_MT(meta, plate, wt_mt_name = "ORF", wt_name = WT_ORFs)
            #WT
            dnp_wt = utils.data2np(wt, skip_first = 5, skip_last = 1)
            if standardize:
                dnp_wt = methods.standardize(dnp_wt)
            _, (plate_wtr, dnp_wtr) = methods.detect_outliers(dnp_wt, wt, out_name = None, save = False)
            #MUTANTS
            dnp_mt = utils.data2np(mt, skip_first = 5, skip_last = 1)
            if standardize:
                dnp_mt = methods.standardize(dnp_mt)
            #save
            plates_wt.extend(plate_wtr)
            dnp_wtc.extend(dnp_wtr)
            plates_mt.extend(mt)
            dnp_mtc.extend(dnp_mt)
    dnp_mtc = np.array(dnp_mtc)
    dnp_wtc = np.array(dnp_wtc)
    print "No. observations of mutant strains: %d" % len(plates_mt)
    print "No. observations of wild-type strains: %d" % len(plates_wt)

    #Compute and save Euclidean distances between mutant and wild-type strains to file.
    print "Computing Euclidean distances between mutant and wild-type strains."
    dist_mat = euclidean_distances(dnp_mtc, dnp_wtc)
    #mean distance of each mutant to all wild-type strains
    m_mt_wt = np.mean(dist_mat, axis = 1)
    print "Mean distance between mutant strain and wild-type: %5.3f" % np.mean(m_mt_wt)
    assert len(plates_mt) == len(m_mt_wt), "Dimension mismatch between mutant computational and meta features."

    #Construct representation of mutant meta data and their distances.
    #Average replicates unless they come from the TS collection.
    _, reps = utils.read_repl(repeats_path, repeats_keys)
    repsd = defaultdict(list)
    for orf, pn, r, c in reps:
        repsd[orf].append((pn, r, c))
    dstd = {}
    for i in xrange(len(plates_mt)):
        pn, date, r, c = plates_mt[i][1:5]
        orf = plates_mt[i][-1]
        dstd[(orf, pn, r, c)] = m_mt_wt[i]
    mutd = defaultdict(list)
    for i in xrange(len(plates_mt)):
        orf = plates_mt[i][-1]
        pn, date, r, c = plates_mt[i][1:5]
        mutd[orf].append((orf, dstd[(orf, pn, r, c)], (pn, date, r, c)))
    mut = []
    for orf, vals in mutd.iteritems():
        if len(vals) == 1:
            # (orf, mean_dist, [(pn1, date1, r1, c1, dist1), (pn2, date2, r2, c2, dist2), ...])
            mut.append((orf, vals[0][1], [vals[0][2]]))
        else:
            valsc = []
            for val in vals:
                TS_in = sum([(k in val[2][0]) for k in repeats_keys]) > 0
                if TS_in:
                    mut.append((orf, val[1], [val[2]]))
                else:
                    valsc.append(val)
            if valsc:
                mean = np.mean([el[1] for el in valsc])
                mut.append((orf, mean, [(el[2][0], el[2][1], el[2][2], el[2][3], el[1]) for el in valsc]))
    mut.sort(reverse = True, key = itemgetter(1))

    #Save computed distances between mutants and WT to file
    out_name = res_path + "MT-WT_distance"
    f = open(out_name + ".csv", "wt")
    f.write("%s\n" % ",".join(["ORF", "distance", "meta"]))
    for el in mut:
        orf, meand, repsm = el
        f.write("%s\n" % (",".join([orf, str(meand), ",".join(str(el).replace("'", "") for el in repsm)])))
    f.close()
    print "Mean mutant distances to WT saved to: %s" % out_name

    #Save processed MT profiles to csv file
    plates_mt_d = {(p[-1], p[1], p[2], p[3], p[4]): i for i, p in enumerate(plates_mt)}
    attrs = data_del[0][0][1]
    data_org, dnp = [], []
    for el in mut:
        orf, _, repsm = el
        for rep in repsm:
            pl, dt, r, c = rep[:4]
            data_org.append(plates_mt[plates_mt_d[(orf, pl, dt, r, c)]])
            dnp.append(dnp_mtc[plates_mt_d[(orf, pl, dt, r, c)]])
    utils.to_csv(attrs, data_org, np.array(dnp), out_name = out_name, noncomp_first = 5, noncomp_last = 1)

    dm_wt_wt = euclidean_distances(dnp_wtc, dnp_wtc)
    wt_wt = np.tril(dm_wt_wt, -1)
    wt_wt = wt_wt[np.nonzero(wt_wt)]
    m_wt_wt = [np.mean(np.hstack((dm_wt_wt[i, :i], dm_wt_wt[i, i+1:]))) for i in xrange(dm_wt_wt.shape[0])]

    #Save computed distances between WT and WT to file
    out_name = res_path + "WT-WT_distance.csv"
    f = open(out_name, "wt")
    f.write("%s\n" % ",".join(["ORF", "plate", "date", "row", "col", "distance"]))
    wt_sort = [(i, ds) for i, ds in enumerate(m_wt_wt)]
    wt_sort.sort(reverse = True, key = itemgetter(1))
    for i, dist in wt_sort:
        orf = plates_wt[i][-1]
        f.write("%s,%5.4f\n" % (",".join([orf] + plates_wt[i][1:5]), dist))
    f.close()
    print "Mean WT distances to WT saved to: %s" % out_name

    #Save processed MT and WT profiles to csv file
    mt_wt_sort = [("WT", i, ds) for (i, ds) in wt_sort] + [("MT", orf, mean, lst) for (orf, mean, lst) in mut]
    mt_wt_sort.sort(reverse = True, key = itemgetter(2))
    attrs = data_del[0][0][1]
    data_org, dnp = [], []
    for el in mt_wt_sort:
        if el[0] == "WT":
            _, i, ds = el
            data_org.append(plates_wt[i])
            dnp.append(dnp_wtc[i])
        elif el[0] == "MT":
            _, orf, _, repsm = el
            for rep in repsm:
                pl, dt, r, c = rep[:4]
                data_org.append(plates_mt[plates_mt_d[(orf, pl, dt, r, c)]])
                dnp.append(dnp_mtc[plates_mt_d[(orf, pl, dt, r, c)]])
    utils.to_csv(attrs, data_org, np.array(dnp), out_name = res_path + "WT_MT_by_distance",
                 noncomp_first = 5, noncomp_last = 1)

    plotting.plot_hist_mean_MT_WT_distance([el[1] for el in mut], res_path)
    plotting.plot_hist_WT_WT_distance(wt_wt, res_path)
    plotting.plot_hist_mean_MT_WT__WT_WT_distance([el[1] for el in mut], m_wt_wt, res_path)
    plotting.plot_hist_signif_MT_WT(mut, plates_mt, dnp_mtc, np.mean([el[1] for el in mut]), res_path)
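#Illustrative sketch: the docstring above mentions assessing significance with
#a permutation test. One generic formulation (an assumption, not necessarily
#what :func:`sa.plotting.plot_hist_signif_MT_WT` computes) pools a mutant
#profile with the wild-type profiles and asks how often a randomly chosen
#pooled profile lies at least as far, on average, from the remaining profiles.
def _example_permutation_test(mt_profile, wt_profiles, n_perm = 1000, seed = 0):
    """Return an empirical p-value for the mean MT-WT distance (sketch only)."""
    rng = np.random.RandomState(seed)
    pool = np.vstack((np.asarray(mt_profile).reshape(1, -1), np.asarray(wt_profiles)))
    def mean_dist(i):
        rest = np.delete(pool, i, axis = 0)
        return np.mean(euclidean_distances(pool[i].reshape(1, -1), rest))
    observed = mean_dist(0)
    null = np.array([mean_dist(rng.randint(pool.shape[0])) for _ in xrange(n_perm)])
    return (np.sum(null >= observed) + 1.0) / (n_perm + 1.0)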
def strains_coll(data_del, data_ts, data_sg, res_path):
    """
    Preprocess plates from each collection (standardize features and remove outlier WT strains) and
    compute distances between strains from the same and different collections. Histograms of distances
    between collections are saved to directory :param:`res_path`.

    :param data_del: Deletion collection plates data as returned from :mod:`utilities.read`.
    :type data_del: `tuple` (meta_data, plates_data)
    :param data_ts: TS collection plates data as returned from :mod:`utilities.read`.
    :type data_ts: `tuple` (meta_data, plates_data)
    :param data_sg: SG collection plates data as returned from :mod:`utilities.read`.
    :type data_sg: `tuple` (meta_data, plates_data)
    :param res_path: Full path to the directory where results are to be saved.
    :type res_path: `str`

    .. seealso:: See also function :func:`sa.plotting.plot_hist_coll`.
    """
    dnps = []
    for coll_name, coll in [("DEL", data_del), ("TS", data_ts), ("SG", data_sg)]:
        print "Preprocessing %s collection" % coll_name
        dnp = []
        for meta, plate in zip(coll[0], coll[1]):
            wt, mt = utils.split_WT_MT(meta, plate, wt_mt_name = "ORF", wt_name = WT_ORFs)
            #WT
            dnp_wt = utils.data2np(wt, skip_first = 5, skip_last = 1)
            dnp_wt = methods.standardize(dnp_wt)
            _, (plate_wtr, dnp_wtr) = methods.detect_outliers(dnp_wt, wt, out_name = None, save = False)
            #MUTANTS
            dnp_mt = utils.data2np(mt, skip_first = 5, skip_last = 1)
            dnp_mt = methods.standardize(dnp_mt)
            #save
            dnp.extend(dnp_wtr)
            dnp.extend(dnp_mt)
        dnps.append(np.array(dnp))
    plotting.plot_hist_coll([dnps[0], dnps[2], dnps[1]], ["Del", "SG", "TS"], out_dir = res_path)
    plotting.plot_hist_coll([dnps[2], dnps[0], dnps[1]], ["SG", "Del", "TS"], out_dir = res_path)
    plotting.plot_hist_coll([dnps[1], dnps[2], dnps[0]], ["TS", "SG", "Del"], out_dir = res_path)
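#Illustrative sketch: the histograms produced by :func:`sa.plotting.plot_hist_coll`
#presumably summarize pairwise Euclidean distances within and between the
#preprocessed collections; flattening a pairwise distance matrix yields the
#values to bin (an assumption about the plotting helper, not its actual code).
def _example_between_coll_distances(dnp_a, dnp_b):
    """Return a 1D array of all pairwise distances between two collections (sketch only)."""
    return euclidean_distances(np.asarray(dnp_a), np.asarray(dnp_b)).ravel()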
def strains_repl(data_del, data_ts, data_sg, repeats_path, res_path, repeats_keys = ["RT", "37"]):
    """
    Analyze mutants that occur multiple times in the data set. First standardize the data and then
    analyze the distance distribution of replicate observations and of all observations.

    :param data_del: Deletion collection plates data as returned from :mod:`utilities.read`.
    :type data_del: `tuple` (meta_data, plates_data)
    :param data_ts: TS collection plates data as returned from :mod:`utilities.read`.
    :type data_ts: `tuple` (meta_data, plates_data)
    :param data_sg: SG collection plates data as returned from :mod:`utilities.read`.
    :type data_sg: `tuple` (meta_data, plates_data)
    :param repeats_path: Full path to file with multi-occurring mutants specification.
    :type repeats_path: `str`
    :param res_path: Full path to the directory where results are to be saved.
    :type res_path: `str`
    :param repeats_keys: Names of TS (temperature sensitive mutants) plates' extensions. By default these are ["RT", "37"].
    :type repeats_keys: `list`

    .. seealso:: See also function :func:`sa.methods.analyze_repl`.
    """
    c_plates = []
    rc_dnp = []
    for _, coll_plates in [data_del, data_ts, data_sg]:
        for plate in coll_plates:
            dnp = utils.data2np(plate, skip_first = 5, skip_last = 1)
            dnp = methods.standardize(dnp)
            c_plates.extend(plate)
            rc_dnp.extend(dnp)
    print "Total number of examples %d" % len(c_plates)
    c_dnp = np.array(rc_dnp)
    methods.analyze_repl(repeats_path, c_plates, c_dnp, res_path, keys = repeats_keys, save_dist_mat = False)
def fss(data_del, data_ts, data_sg, res_path):
    """
    Feature subset selection (FSS) for unsupervised learning: clustering based on the feature
    subspace with the highest score. A low-dimensional representation (MDS) of the best clustering
    is saved to directory :param:`res_path`.

    :param data_del: Deletion collection plates data as returned from :mod:`utilities.read`.
    :type data_del: `tuple` (meta_data, plates_data)
    :param data_ts: TS collection plates data as returned from :mod:`utilities.read`.
    :type data_ts: `tuple` (meta_data, plates_data)
    :param data_sg: SG collection plates data as returned from :mod:`utilities.read`.
    :type data_sg: `tuple` (meta_data, plates_data)
    :param res_path: Full path to the directory where results are to be saved.
    :type res_path: `str`

    .. seealso:: See also functions :func:`sa.methods.fss_wrapper`, :func:`sa.methods.decompose_MDS`
        and :func:`sa.utilities.std_prep`.
    """
    dnp, plates = utils.std_prep(data_del, data_ts, data_sg, res_path, wt_attr_name = "ORF", wt_name = WT_ORFs)
    dnp = dnp[:, 1:]
    attr_names = data_del[0][0][1][6:-1]
    assert dnp.shape[1] == len(attr_names), "The shapes of attribute space and feature names do not match."
    pred_best, score_best, attr_best = methods.fss_wrapper(dnp, plates, attr_names, out_dir = res_path)
    _ = methods.decompose_MDS(dnp, pred_best, out_dir = res_path, save_coordinates = True)
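#Illustrative sketch: the low-dimensional view of the best clustering is
#produced by :func:`sa.methods.decompose_MDS`. Assuming it relies on
#sklearn.manifold.MDS over Euclidean dissimilarities, a minimal version could
#look like this (n_components and random_state are hypothetical choices).
def _example_mds_embedding(X, n_components = 2, seed = 0):
    """Return an MDS embedding of the rows of X (sketch only)."""
    from sklearn.manifold import MDS
    mds = MDS(n_components = n_components, dissimilarity = "euclidean", random_state = seed)
    return mds.fit_transform(np.asarray(X, dtype = float))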
def fss_post_cluster(data_del, data_ts, data_sg, fss_subset_path, fss_cluster_path, res_path):
    """
    Read clustering predictions and description of feature subspace. Run MDS optimization and save
    plotted coordinates.

    :param data_del: Deletion collection plates data as returned from :mod:`utilities.read`.
    :type data_del: `tuple` (meta_data, plates_data)
    :param data_ts: TS collection plates data as returned from :mod:`utilities.read`.
    :type data_ts: `tuple` (meta_data, plates_data)
    :param data_sg: SG collection plates data as returned from :mod:`utilities.read`.
    :type data_sg: `tuple` (meta_data, plates_data)
    :param fss_subset_path: Full path to the file with feature space description as obtained by :mod:`analysis.fss`.
    :type fss_subset_path: `str`
    :param fss_cluster_path: Full path to the file with predictions for observations as obtained by :mod:`analysis.fss`.
    :type fss_cluster_path: `str`
    :param res_path: Full path to the directory where results are to be saved.
    :type res_path: `str`

    .. seealso:: See also functions :func:`sa.methods.decompose_MDS` and :func:`sa.utilities.std_prep`.
    """
    dnp, plates = utils.std_prep(data_del, data_ts, data_sg, res_path, wt_attr_name = "ORF", wt_name = WT_ORFs)
    dnp = dnp[:, 1:]
    attr_names = data_del[0][0][1][6:-1]
    assert dnp.shape[1] == len(attr_names), "The shapes of attribute space and feature names do not match."

    f_subset = open(fss_subset_path, "r")
    fs = f_subset.readline().strip().split(",")[1:]
    f_subset.close()
    print "Feature subspace size: %d" % len(fs)
    print "Feature subspace: %s" % ", ".join(fs)

    f_cluster = open(fss_cluster_path, "r")
    pred = [line.strip().split(",") for line in f_cluster]
    f_cluster.close()

    attr2idx = {attr: i for i, attr in enumerate(attr_names)}
    fs_idx = [attr2idx[attr] for attr in fs]
    plt2idx = {(el[1], el[3], el[4]): i for i, el in enumerate(plates)}
    c_labels = np.ones(dnp.shape[0]) * -1
    for el in pred:
        plt, r, c = el[1], el[3], el[4]
        if (plt, r, c) in plt2idx:
            c_labels[plt2idx[(plt, r, c)]] = int(el[5])
    dnp_fss = dnp[:, fs_idx]
    assert dnp_fss.shape[1] == len(fs), "Dimension mismatch."
    _ = methods.decompose_MDS(dnp_fss, c_labels, out_dir = res_path, save_coordinates = True)
def strains_Np_novelty_MT(data_del, data_ts, data_sg, res_path):
    """
    Find mutants with significantly different profiles than wild-type cells by novelty detection
    using one-class SVM and GMM.

    :param data_del: Deletion collection plates data as returned from :mod:`utilities.read`.
    :type data_del: `tuple` (meta_data, plates_data)
    :param data_ts: TS collection plates data as returned from :mod:`utilities.read`.
    :type data_ts: `tuple` (meta_data, plates_data)
    :param data_sg: SG collection plates data as returned from :mod:`utilities.read`.
    :type data_sg: `tuple` (meta_data, plates_data)
    :param res_path: Full path to the directory where results are to be saved.
    :type res_path: `str`

    .. seealso:: See also functions :func:`sa.methods.detect_novelties_SVM` and :func:`sa.methods.detect_novelties_GMM`.
    """
    plates_wt, dnp_wtc = [], []
    plates_mt, dnp_mtc = [], []
    for coll in [data_del, data_ts, data_sg]:
        for meta, plate in zip(coll[0], coll[1]):
            wt, mt = utils.split_WT_MT(meta, plate, wt_mt_name = "ORF", wt_name = WT_ORFs)
            #WT
            dnp_wt = utils.data2np(wt, skip_first = 5, skip_last = 1)
            dnp_wt = methods.standardize(dnp_wt)
            _, (plate_wtr, dnp_wtr) = methods.detect_outliers(dnp_wt, wt, out_name = None, save = False)
            #MUTANTS
            dnp_mt = utils.data2np(mt, skip_first = 5, skip_last = 1)
            dnp_mt = methods.standardize(dnp_mt)
            #save
            plates_wt.extend(plate_wtr)
            dnp_wtc.extend(dnp_wtr)
            plates_mt.extend(mt)
            dnp_mtc.extend(dnp_mt)
    dnp_mtc = np.array(dnp_mtc)
    dnp_wtc = np.array(dnp_wtc)
    print "No. observations of mutant strains: %d" % len(plates_mt)
    print "No. observations of wild-type strains: %d" % len(plates_wt)

    methods.detect_novelties_SVM(dnp_wtc, dnp_mtc, plates_wt, plates_mt, res_path, save_visualization = True)
    methods.detect_novelties_GMM(dnp_wtc, dnp_mtc, plates_wt, plates_mt, res_path, save_visualization = True,
                                 wt_name = WT_ORFs)
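#Illustrative sketch: assuming :func:`sa.methods.detect_novelties_SVM` follows
#the standard one-class SVM recipe (train on wild-type profiles only, flag
#mutants predicted as -1), a minimal version could look like this; nu and
#gamma below are hypothetical and would need tuning.
def _example_novelty_svm(dnp_wt, dnp_mt, nu = 0.1, gamma = 0.1):
    """Return a boolean mask of mutant rows flagged as novelties (sketch only)."""
    from sklearn.svm import OneClassSVM
    clf = OneClassSVM(kernel = "rbf", nu = nu, gamma = gamma)
    clf.fit(np.asarray(dnp_wt, dtype = float))
    return clf.predict(np.asarray(dnp_mt, dtype = float)) == -1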