Source code for sa.mds

"""
################
MDS (``sa.mds``)
################

Multidimensional scaling.

"""

# author: Nelle Varoquaux <[email protected]>
# Licence: BSD

import numpy as np

import warnings

from sklearn.base import BaseEstimator
from sklearn.metrics import euclidean_distances
from sklearn.utils import check_random_state
from sklearn.externals.joblib import Parallel
from sklearn.externals.joblib import delayed


def pool_adjacent_violators(distances, similarities, max_iter=300,
                            verbose=0):
    """
    Pool adjacent violators

    Computes an isotonic regression of distances on similarities.

    Parameters
    ----------
    distances: ndarray, shape (n, 1)
        array to fit

    similarities: ndarray, shape (n, 1)
        array on which to fit

    max_iter: int, optional, default: 300
        Maximum number of iterations

    verbose: int, optional, default: 0
        level of verbosity

    Returns
    -------
    distances: ndarray, shape (n, 1)
        the fitted distances, isotonic in the order of the similarities

    Notes
    -----
    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
    Groenen P. Springer Series in Statistics (1997)
    """
    if verbose == 2:
        print("In PAV")

    # First approach for ties: ignore them. The multidimensional scaling won't
    # enforce that points with equal similarity be at equal distance.
    indxs = np.lexsort((distances, similarities))

    new_blocks = list(range(len(indxs)))

    block = []
    sort = True
    for it in range(max_iter):
        sort = False
        blocks = new_blocks[:]
        new_blocks = []
        block = []
        dis = distances[indxs[blocks[:-1]]] <= distances[indxs[blocks[1:]]] + \
            np.finfo(float).resolution
        for i, element in enumerate(dis):
            if not element:
                sort = True
                block.append(blocks[i])
            elif element and block:
                # Pool the current block of violators to its mean
                tmp = np.arange(block[0], blocks[i + 1])
                distances[indxs[tmp]] = distances[indxs[tmp]].mean()
                new_blocks.append(block[0])
                block = []
            else:
                new_blocks.append(blocks[i])
        # The last element
        if block:
            tmp = np.arange(block[0], len(similarities))
            distances[indxs[tmp]] = distances[indxs[tmp]].mean()
            new_blocks.append(block[0])
        else:
            new_blocks.append(len(similarities) - 1)

        if not sort:
            if verbose:
                print("Breaking at iteration %d" % it)
            break

    return distances
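

# Illustrative sketch (not part of the original module): a tiny check of
# ``pool_adjacent_violators``. The fitted distances come back non-decreasing
# when read in order of increasing similarity; the violating pair below is
# pooled to its mean.
def _demo_pool_adjacent_violators():
    similarities = np.array([1., 2., 3., 4.])
    distances = np.array([0.5, 2.0, 1.0, 3.0])  # 2.0 > 1.0 breaks monotonicity
    fitted = pool_adjacent_violators(distances.copy(), similarities)
    print(fitted)  # expected: [0.5, 1.5, 1.5, 3.0]
    return fitted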


def _smacof_single(similarities, metric=True, n_components=2, init=None,
                   max_iter=300, verbose=0, eps=1e-3, random_state=None):
    """
    Computes multidimensional scaling using the SMACOF algorithm

    Parameters
    ----------
    similarities: symmetric ndarray, shape (n_samples, n_samples)
        similarities between the points

    metric: boolean, optional, default: True
        compute the metric or nonmetric SMACOF algorithm

    n_components: int, optional, default: 2
        number of dimensions in which to immerse the similarities;
        overridden if an initial array is provided.

    init: {None or ndarray}
        if None, randomly chooses the initial configuration
        if ndarray, initialize the SMACOF algorithm with this array

    max_iter: int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run

    verbose: int, optional, default: 0
        level of verbosity

    eps: float, optional, default: 1e-3
        relative tolerance w.r.t stress at which to declare convergence

    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the starting configuration. If an
        integer is given, it fixes the seed. Defaults to the global numpy
        random number generator.

    Returns
    -------
    X: ndarray (n_samples, n_components), float
        coordinates of the n_samples points in a n_components-space

    stress: float
        The final value of the stress (sum of squared distances of the
        disparities and the distances for all constrained points)
    """
    if verbose == 2:
        print("In SMACOF")
    n_samples = similarities.shape[0]
    random_state = check_random_state(random_state)

    if similarities.shape[0] != similarities.shape[1]:
        raise ValueError("similarities must be a square array (shape=%d)" %
                         n_samples)
    res = 100 * np.finfo(float).resolution
    if np.any((similarities - similarities.T) > res):
        raise ValueError("similarities must be symmetric")

    sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
    sim_flat_w = sim_flat[sim_flat != 0]

    if init is None:
        # Randomly choose initial configuration
        X = random_state.rand(n_samples * n_components)
        X = X.reshape((n_samples, n_components))
    else:
        # overrides the parameter n_components
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" %
                             (n_samples, n_components))
        X = init

    old_stress = None
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis = euclidean_distances(X)

        if metric:
            disparities = similarities
        else:
            dis_flat = dis.ravel()
            # similarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression
            disparities_flat = pool_adjacent_violators(dis_flat_w,
                                                       sim_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
                                   (disparities ** 2).sum())

        # Compute stress
        stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2

        # Update X using the Guttman transform
        dis[dis == 0] = 1e-5
        ratio = disparities / dis
        B = - ratio
        B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
        X = 1. / n_samples * np.dot(B, X)

        # Normalization term used in the convergence check
        dis = np.sqrt((X ** 2).sum(axis=1)).sum()
        if verbose == 2:
            print('it: %d, stress %s' % (it, stress))
        if old_stress is not None:
            if (old_stress - stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' % (it,
                                                                       stress))
                break
        old_stress = stress / dis

    return X, stress
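

# Illustrative sketch (not part of the original module): a single metric
# SMACOF run on a small symmetric dissimilarity matrix. The returned ``X``
# has shape (n_samples, n_components); ``stress`` is the final stress value.
def _demo_smacof_single():
    sim = np.array([[0., 5., 4., 3.],
                    [5., 0., 2., 2.],
                    [4., 2., 0., 1.],
                    [3., 2., 1., 0.]])
    X, stress = _smacof_single(sim, n_components=2, random_state=42)
    print(X.shape, stress)  # (4, 2) and a non-negative float
    return X, stress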


def smacof(similarities, metric=True, n_components=2, init=None, n_init=8,
           n_jobs=1, max_iter=300, verbose=0, eps=1e-3, random_state=None):
    """
    Computes multidimensional scaling using the SMACOF (Scaling by Majorizing
    a Complicated Function) algorithm

    The SMACOF algorithm is a multidimensional scaling algorithm: it minimizes
    an objective function, the *stress*, using a majorization technique. The
    Stress Majorization, also known as the Guttman Transform, guarantees a
    monotone convergence of the stress, and is more powerful than traditional
    techniques such as gradient descent.

    The SMACOF algorithm for metric MDS can be summarized by the following
    steps:

    1. Set an initial start configuration, randomly or not.
    2. Compute the stress.
    3. Compute the Guttman transform.
    4. Iterate 2 and 3 until convergence.

    The nonmetric algorithm adds a monotonic regression step before computing
    the stress.

    Parameters
    ----------
    similarities : symmetric ndarray, shape (n_samples, n_samples)
        similarities between the points

    metric : boolean, optional, default: True
        compute the metric or nonmetric SMACOF algorithm

    n_components : int, optional, default: 2
        number of dimensions in which to immerse the similarities;
        overridden if an initial array is provided.

    init : {None or ndarray of shape (n_samples, n_components)}
        if None, randomly chooses the initial configuration
        if ndarray, initialize the SMACOF algorithm with this array

    n_init : int, optional, default: 8
        Number of times the SMACOF algorithm will be run with different
        initializations. The final result will be the best output of the
        n_init consecutive runs in terms of stress.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them
        in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 - n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run

    verbose : int, optional, default: 0
        level of verbosity

    eps : float, optional, default: 1e-3
        relative tolerance w.r.t stress at which to declare convergence

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the starting configuration. If an
        integer is given, it fixes the seed. Defaults to the global numpy
        random number generator.

    Returns
    -------
    X : ndarray (n_samples, n_components)
        Coordinates of the n_samples points in a n_components-space

    stress : float
        The final value of the stress (sum of squared distances of the
        disparities and the distances for all constrained points)

    Notes
    -----
    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
    Groenen P. Springer Series in Statistics (1997)

    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
    Psychometrika, 29 (1964)

    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
    hypothesis" Kruskal, J. Psychometrika, 29 (1964)
    """
    random_state = check_random_state(random_state)

    if hasattr(init, '__array__'):
        init = np.asarray(init).copy()
        if n_init != 1:
            warnings.warn(
                'Explicit initial positions passed: '
                'performing only one init of the MDS instead of %d' % n_init)
            n_init = 1

    best_pos, best_stress = None, None
    if n_jobs == 1:
        for it in range(n_init):
            pos, stress = _smacof_single(similarities, metric=metric,
                                         n_components=n_components,
                                         init=init, max_iter=max_iter,
                                         verbose=verbose, eps=eps,
                                         random_state=random_state)
            if best_stress is None or stress < best_stress:
                best_stress = stress
                best_pos = pos.copy()
    else:
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
            delayed(_smacof_single)(
                similarities, metric=metric, n_components=n_components,
                init=init, max_iter=max_iter, verbose=verbose, eps=eps,
                random_state=seed)
            for seed in seeds)
        positions, stress = zip(*results)
        best = np.argmin(stress)
        best_stress = stress[best]
        best_pos = positions[best]
    return best_pos, best_stress
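

# Illustrative sketch (not part of the original module): ``smacof`` restarts
# the algorithm ``n_init`` times and keeps the embedding with the lowest
# stress; seeding ``random_state`` makes the run reproducible.
def _demo_smacof():
    sim = np.array([[0., 5., 4., 3.],
                    [5., 0., 2., 2.],
                    [4., 2., 0., 1.],
                    [3., 2., 1., 0.]])
    pos, stress = smacof(sim, n_components=2, n_init=4, random_state=0)
    print(pos.shape, stress)  # (4, 2) and the best stress over the 4 runs
    return pos, stress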


class MDS(BaseEstimator):
    """
    Multidimensional scaling

    Parameters
    ----------
    metric : boolean, optional, default: True
        compute the metric or nonmetric SMACOF (Scaling by Majorizing a
        Complicated Function) algorithm

    n_components : int, optional, default: 2
        number of dimensions in which to immerse the similarities;
        overridden if an initial array is provided.

    n_init : int, optional, default: 4
        Number of times the SMACOF algorithm will be run with different
        initializations. The final result will be the best output of the
        n_init consecutive runs in terms of stress.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run

    verbose : int, optional, default: 0
        level of verbosity

    eps : float, optional, default: 1e-3
        relative tolerance w.r.t stress at which to declare convergence

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them
        in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 - n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the starting configuration. If an
        integer is given, it fixes the seed. Defaults to the global numpy
        random number generator.

    Attributes
    ----------
    ``embedding_`` : array-like, shape (n_samples, n_components)
        Stores the position of the dataset in the embedding space

    ``stress_`` : float
        The final value of the stress (sum of squared distances of the
        disparities and the distances for all constrained points)

    Notes
    -----
    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
    Groenen P. Springer Series in Statistics (1997)

    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
    Psychometrika, 29 (1964)

    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
    hypothesis" Kruskal, J. Psychometrika, 29 (1964)
    """
    def __init__(self, n_components=2, metric=True, n_init=4,
                 max_iter=300, verbose=0, eps=1e-3, n_jobs=1,
                 random_state=None):
        self.n_components = n_components
        self.metric = metric
        self.n_init = n_init
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.random_state = random_state

    def fit(self, X, init=None, y=None):
        """
        Computes the position of the points in the embedding space

        Parameters
        ----------
        X : array, shape (n_samples, n_samples), symmetric
            Proximity matrix

        init : {None or ndarray, shape (n_samples, n_components)}
            if None, randomly chooses the initial configuration
            if ndarray, initialize the SMACOF algorithm with this array
        """
        self.embedding_, self.stress_ = smacof(
            X, metric=self.metric, n_components=self.n_components,
            init=init, n_init=self.n_init, n_jobs=self.n_jobs,
            max_iter=self.max_iter, verbose=self.verbose, eps=self.eps,
            random_state=self.random_state)
        return self

    def fit_transform(self, X, init=None, y=None):
        """
        Fit the data from X, and returns the embedded coordinates

        Parameters
        ----------
        X : array, shape (n_samples, n_samples), symmetric
            Proximity matrix

        init : {None or ndarray, shape (n_samples, n_components)}
            if None, randomly chooses the initial configuration
            if ndarray, initialize the SMACOF algorithm with this array
        """
        self.embedding_, self.stress_ = smacof(
            X, metric=self.metric, n_components=self.n_components,
            init=init, n_init=self.n_init, n_jobs=self.n_jobs,
            max_iter=self.max_iter, verbose=self.verbose, eps=self.eps,
            random_state=self.random_state)
        return self.embedding_
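

# Illustrative usage sketch (not part of the original module): the estimator
# mirrors the scikit-learn API. ``fit_transform`` takes a symmetric proximity
# matrix and returns the embedded coordinates; the stress of the best run is
# stored in ``stress_``.
def _demo_mds_estimator():
    points = np.array([[0., 0.], [1., 0.], [0., 1.], [2., 2.]])
    proximities = euclidean_distances(points)  # symmetric dissimilarity matrix
    mds = MDS(n_components=2, n_init=4, random_state=0)
    coords = mds.fit_transform(proximities)
    print(coords.shape, mds.stress_)  # (4, 2) and the final stress value
    return coords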