1.
markovmodel/PyEMMA · api.py
Match rating: 64.42%
def cluster_mini_batch_kmeans(data=None, k=100, max_iter=10, batch_size=0.2, metric='euclidean',
                              init_strategy='kmeans++', n_jobs=None, chunksize=None, skip=0, clustercenters=None, **kwargs):
    r"""k-means clustering with mini-batch strategy

    Mini-batch k-means is an approximation to k-means which picks a randomly
    selected subset of data points to be updated in each iteration. Usually
    much faster than k-means but will likely deliver a less optimal result.

    Returns
    -------
    kmeans_mini : a :class:`MiniBatchKmeansClustering <pyemma.coordinates.clustering.MiniBatchKmeansClustering>` clustering object
        Object for mini-batch kmeans clustering.
        It holds discrete trajectories and cluster center information.

    See also
    --------
    :func:`kmeans <pyemma.coordinates.kmeans>` : for full k-means clustering


    .. autoclass:: pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering
        :members:
        :undoc-members:

        .. rubric:: Methods

        .. autoautosummary:: pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering
           :methods:

        .. rubric:: Attributes

        .. autoautosummary:: pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering
            :attributes:

    References
    ----------
    .. [1] http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf

    """
    from pyemma.coordinates.clustering.kmeans import MiniBatchKmeansClustering
    res = MiniBatchKmeansClustering(n_clusters=k, max_iter=max_iter, metric=metric, init_strategy=init_strategy,
                                    batch_size=batch_size, n_jobs=n_jobs, skip=skip, clustercenters=clustercenters)
    from pyemma.util.reflection import get_default_args
    cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_mini_batch_kmeans)['chunksize'], **kwargs)
    if data is not None:
        res.estimate(data, chunksize=cs)
    else:
        res.chunksize = cs
    return res
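A minimal usage sketch of this function, assuming a working PyEMMA installation; the random input data, parameter values, and attribute access follow the usual PyEMMA clustering interface and are illustrative only:

import numpy as np
import pyemma.coordinates as coor

# two short synthetic "trajectories" with 3 features each
trajs = [np.random.random((500, 3)), np.random.random((500, 3))]

# mini-batch k-means: each iteration updates on ~20% of the data
clustering = coor.cluster_mini_batch_kmeans(trajs, k=50, max_iter=10, batch_size=0.2)

dtrajs = clustering.dtrajs              # discrete trajectories (cluster index per frame)
centers = clustering.clustercenters     # (k, 3) array of cluster centers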
2.
daler/metaseq · plotutils.py
Match rating: 62.84%
def new_clustered_sortind(x, k=10, row_key=None, cluster_key=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Rows within each cluster can be sorted with `row_key`, and the clusters
    themselves ordered with `cluster_key`; if neither is given, rows and
    clusters are left in their original order.

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param row_key:
        Optional function to act as a sort key for sorting rows within
        clusters.  Signature should be `scorefunc(a)` where `a` is a 1-D NumPy
        array.
    :param cluster_key:
        Optional function for sorting clusters.  Signature is `clusterfunc(a)`
        where `a` is a NumPy array containing all rows of `x` for cluster `i`.
        It must return a single value.
    """
    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikit-learn for clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k

    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        # choose the k with the lowest mean distance from cluster centers
        best_k = sorted(mean_dists.items(), key=lambda item: item[1])[0][0]

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k
    labels = mbk.labels_
    scores = np.zeros(labels.shape, dtype=float)

    if cluster_key:
        # It's easier for calling code to provide something that operates on
        # a cluster level, but here it's converted to work on a label level
        # that looks in to the array `x`.
        def _cluster_key(i):
            return cluster_key(x[labels == i, :])
        sorted_labels = sorted(range(k), key=_cluster_key)
    else:
        # Otherwise just use them as-is.
        sorted_labels = range(k)

    if row_key:
        # Again, easier to provide a function to operate on a row.  But here we
        # need it to accept an index
        def _row_key(i):
            return row_key(x[i, :])

    final_ind = []
    breaks = []
    pos = 0
    for label in sorted_labels:
        # which rows in `x` have this label
        label_inds = np.nonzero(labels == label)[0]
        if row_key:
            label_sort_ind = sorted(label_inds, key=_row_key)
        else:
            label_sort_ind = label_inds
        for li in label_sort_ind:
            final_ind.append(li)
        pos += len(label_inds)
        breaks.append(pos)

    return np.array(final_ind), np.array(breaks)
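Following the docstring's suggestion of drawing a line at each "break", a hedged sketch of how the returned index might be used (synthetic data; requires NumPy, scikit-learn, and matplotlib):

import numpy as np
import matplotlib.pyplot as plt

# synthetic matrix: 300 rows drawn from three shifted distributions
x = np.vstack([np.random.normal(loc, 1.0, size=(100, 50)) for loc in (0.0, 3.0, 6.0)])

ind, breaks = new_clustered_sortind(x, k=3, row_key=lambda a: a.max())

fig, ax = plt.subplots()
ax.imshow(x[ind], aspect='auto', interpolation='nearest')
for b in breaks:
    ax.axhline(b, color='k')   # cluster boundary, as described in the docstring
plt.show()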
3.
oscarbranson/latools · clustering.py
Match rating: 62.62%
def cluster_kmeans(data, n_clusters, **kwargs):
    """
    Identify clusters using the K-Means algorithm.

    Parameters
    ----------
    data : array_like
        array of size [n_samples, n_features].
    n_clusters : int
        The number of clusters expected in the data.

    Returns
    -------
    tuple
        ``(labels, [np.nan])``, where ``labels`` assigns a cluster index to
        each sample.
    """
    km = cl.KMeans(n_clusters, **kwargs)
    kmf = km.fit(data)

    labels = kmf.labels_

    return labels, [np.nan]
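The original docstring promises one boolean array per cluster; a small sketch of turning the returned labels into such masks (the sklearn.cluster alias `cl` used by the snippet is assumed to be imported at module level):

import numpy as np
from sklearn import cluster as cl   # alias assumed by the snippet above

data = np.random.random((200, 2))
labels, _ = cluster_kmeans(data, n_clusters=3)

# one boolean mask per identified cluster, keyed by cluster index
masks = {c: labels == c for c in np.unique(labels)}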
4.
markovmodel/PyEMMA · api.py
Match rating: 61.47%
def cluster_kmeans(data=None, k=None, max_iter=10, tolerance=1e-5, stride=1,
                   metric='euclidean', init_strategy='kmeans++', fixed_seed=False,
                   n_jobs=None, chunksize=None, skip=0, keep_data=False, clustercenters=None, **kwargs):
    r"""k-means clustering

    If data is given, it performs a k-means clustering and then assigns the
    data using a Voronoi discretization. It returns a :class:`KmeansClustering <pyemma.coordinates.clustering.KmeansClustering>`
    object that can be used to extract the discretized data sequences, or to
    assign other data points to the same partition. If data is not given, an
    empty :class:`KmeansClustering <pyemma.coordinates.clustering.KmeansClustering>`
    will be created that still needs to be parametrized, e.g. in a :func:`pipeline`.

    Parameters
    ----------
    data: ndarray (T, d) or list of ndarray (T_i, d) or a reader created by :func:`source`
        input data, if available in memory

    k: int
        the number of cluster centers. When not specified (None), min(sqrt(N), 5000) is chosen as default value,
        where N denotes the number of data points

    max_iter : int
        maximum number of iterations before stopping; default is 10.

    tolerance : float
        stop iteration when the relative change in the cost function

        :math:`C(S) = \sum_{i=1}^{k} \sum_{\mathbf x \in S_i} \left\| \mathbf x - \boldsymbol\mu_i \right\|^2`

        is smaller than tolerance.

    stride : int, optional, default = 1
        If set to 1, all input data will be used for estimation. Note that this
        could cause this calculation to be very slow for large data sets. Since
        molecular dynamics data is usually correlated at short timescales, it
        is often sufficient to estimate transformations at a longer stride.
        Note that the stride option in the get_output() function of the returned
        object is independent, so you can parametrize at a long stride, and
        still map all frames through the transformer.

    metric : str
        metric to use during clustering ('euclidean', 'minRMSD')

    init_strategy : str
        determines if the initial cluster centers are chosen according to the kmeans++-algorithm
        or drawn uniformly distributed from the provided data set

    fixed_seed : bool or (positive) integer
        if set to true, the random seed gets fixed resulting in deterministic behavior; default is false.
        If an integer >= 0 is given, use this to initialize the random generator.

    n_jobs : int or None, default None
        Number of threads to use during assignment of the data.
        If None, all available CPUs will be used.

    chunksize: int, default=None
        Number of data frames to process at once. Choose a higher value here,
        to optimize thread usage and gain processing speed. If None is passed,
        use the default value of the underlying reader/data source. Choose zero to
        disable chunking at all.

    skip : int, default=0
        skip the first initial n frames per trajectory.

    keep_data: boolean, default=False
        if you intend to quickly resume a non-converged kmeans iteration, set this to True.
        Otherwise the linear memory array will have to be re-created. Note that the data will also be deleted,
        if and only if the estimation converged within the given tolerance parameter.

    clustercenters: ndarray (k, dim), default=None
        if passed, the init_strategy is ignored and these centers will be iterated.

    Returns
    -------
    kmeans : a :class:`KmeansClustering <pyemma.coordinates.clustering.KmeansClustering>` clustering object
        Object for kmeans clustering.
        It holds discrete trajectories and cluster center information.


    Examples
    --------

    >>> import numpy as np
    >>> from pyemma.util.contexts import settings
    >>> import pyemma.coordinates as coor
    >>> traj_data = [np.random.random((100, 3)), np.random.random((100,3))]
    >>> with settings(show_progress_bars=False):
    ...     cluster_obj = coor.cluster_kmeans(traj_data, k=20, stride=1)
    ...     cluster_obj.get_output() # doctest: +ELLIPSIS
    [array([...

    .. seealso:: **Theoretical background**: `Wiki page <http://en.wikipedia.org/wiki/K-means_clustering>`_


    .. autoclass:: pyemma.coordinates.clustering.kmeans.KmeansClustering
        :members:
        :undoc-members:

        .. rubric:: Methods

        .. autoautosummary:: pyemma.coordinates.clustering.kmeans.KmeansClustering
           :methods:

        .. rubric:: Attributes

        .. autoautosummary:: pyemma.coordinates.clustering.kmeans.KmeansClustering
            :attributes:

    References
    ----------
    The k-means algorithm was invented in [1]_. The term k-means was
    first used in [2]_.

    .. [1] Steinhaus, H. (1957).
        Sur la division des corps materiels en parties.
        Bull. Acad. Polon. Sci. (in French) 4, 801-804.

    .. [2] MacQueen, J. B. (1967).
        Some Methods for classification and Analysis of Multivariate Observations.
        Proceedings of 5th Berkeley Symposium on Mathematical Statistics and
        Probability 1. University of California Press. pp. 281-297

    """
    from pyemma.coordinates.clustering.kmeans import KmeansClustering
    res = KmeansClustering(n_clusters=k, max_iter=max_iter, metric=metric, tolerance=tolerance,
                           init_strategy=init_strategy, fixed_seed=fixed_seed, n_jobs=n_jobs, skip=skip,
                           keep_data=keep_data, clustercenters=clustercenters, stride=stride)
    from pyemma.util.reflection import get_default_args
    cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_kmeans)['chunksize'], **kwargs)
    if data is not None:
        res.estimate(data, chunksize=cs)
    else:
        res.chunksize = cs
    return res
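For reference, the tolerance criterion compares relative changes of the cost function C(S) given in the docstring; a minimal NumPy sketch of that quantity (illustrative, not PyEMMA's internal implementation):

import numpy as np

def kmeans_cost(X, centers, assignments):
    # C(S) = sum_i sum_{x in S_i} ||x - mu_i||^2
    diffs = X - centers[assignments]
    return np.sum(diffs ** 2)

X = np.random.random((100, 2))
centers = np.array([[0.25, 0.25], [0.75, 0.75]])
assignments = np.argmin(((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1), axis=1)

cost = kmeans_cost(X, centers, assignments)
# the iteration would stop once abs(cost_prev - cost) / cost_prev < tolerance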
5.
daler/metaseq · plotutils.py
Match rating: 61.21%
def clustered_sortind(x, k=10, scorefunc=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Each cluster of rows is then sorted by `scorefunc` -- by default, the max
    peak height when all rows in a cluster are averaged, or
    cluster.mean(axis=0).max().

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param scorefunc: Optional function for sorting rows within clusters.  Must
        accept a single argument of a NumPy array.
    """
    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikit-learn for clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k

    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        # choose the k with the lowest mean distance from cluster centers
        best_k = sorted(mean_dists.items(), key=lambda item: item[1])[0][0]

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k
    labels = mbk.labels_
    scores = np.zeros(labels.shape, dtype=float)

    if not scorefunc:
        def scorefunc(x):
            return x.mean(axis=0).max()

    for label in range(k):
        ind = labels == label
        score = scorefunc(x[ind, :])
        scores[ind] = score

    pos = 0
    breaks = []
    ind = np.argsort(scores)
    for _label, group in itertools.groupby(labels[ind]):
        pos += len(list(group))
        breaks.append(pos)

    return ind, breaks
6.
slundberg/shap · kernel.py
Match rating: 58.36%
def kmeans(X, k, round_values=True):
    """ Summarize a dataset with k mean samples weighted by the number of data points they
    each represent.

    Parameters
    ----------
    X : numpy.array or pandas.DataFrame
        Matrix of data samples to summarize (# samples x # features)

    k : int
        Number of means to use for approximation.

    round_values : bool
        For all i, round the ith dimension of each mean sample to match the nearest value
        from X[:,i]. This ensures discrete features always get a valid value.

    Returns
    -------
    DenseData object.
    """

    group_names = [str(i) for i in range(X.shape[1])]
    if str(type(X)).endswith("'pandas.core.frame.DataFrame'>"):
        group_names = X.columns
        X = X.values
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)

    if round_values:
        for i in range(k):
            for j in range(X.shape[1]):
                ind = np.argmin(np.abs(X[:,j] - kmeans.cluster_centers_[i,j]))
                kmeans.cluster_centers_[i,j] = X[ind,j]
    return DenseData(kmeans.cluster_centers_, group_names, None, 1.0*np.bincount(kmeans.labels_))
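A typical, hedged use of this summary is as background data for shap's KernelExplainer; the regression model below is a stand-in chosen only for illustration:

import numpy as np
import shap
from sklearn.linear_model import LinearRegression

X = np.random.random((500, 4))
y = X @ np.array([1.0, -2.0, 0.5, 0.0])
model = LinearRegression().fit(X, y)

# summarize the background data with 10 weighted means
background = shap.kmeans(X, 10)

explainer = shap.KernelExplainer(model.predict, background)
shap_values = explainer.shap_values(X[:5])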
7.
brainiak/brainiak · tfa.py
Match rating: 58.21%
def init_centers_widths(self, R):
        """Initialize prior of centers and widths

        Returns
        -------

        centers : 2D array, with shape [K, n_dim]
            Prior of factors' centers.

        widths : 1D array, with shape [K, 1]
            Prior of factors' widths.

        """

        kmeans = KMeans(
            init='k-means++',
            n_clusters=self.K,
            n_init=10,
            random_state=100)
        kmeans.fit(R)
        centers = kmeans.cluster_centers_
        widths = self._get_max_sigma(R) * np.ones((self.K, 1))
        return centers, widths
8.
dask/dask-ml · spectral.py
Match rating: 56.61%
def fit(self, X, y=None):
        X = self._check_array(X)
        n_components = self.n_components
        metric = self.affinity
        rng = check_random_state(self.random_state)
        n_clusters = self.n_clusters

        # kmeans for final clustering
        if isinstance(self.assign_labels, six.string_types):
            if self.assign_labels == "kmeans":
                km = KMeans(
                    n_clusters=n_clusters,
                    random_state=draw_seed(rng, np.iinfo("i4").max, dtype="uint"),
                )
            elif self.assign_labels == "sklearn-kmeans":
                km = sklearn.cluster.KMeans(n_clusters=n_clusters, random_state=rng)
            else:
                msg = "Unknown 'assign_labels' {!r}".format(self.assign_labels)
                raise ValueError(msg)
        elif isinstance(self.assign_labels, BaseEstimator):
            km = self.assign_labels
        else:
            raise TypeError(
                "Invalid type {} for 'assign_labels'".format(type(self.assign_labels))
            )

        if self.kmeans_params:
            km.set_params(**self.kmeans_params)

        n = len(X)
        if n <= n_components:
            msg = (
                "'n_components' must be smaller than the number of samples."
                " Got {} components and {} samples".format(n_components, n)
            )
            raise ValueError(msg)

        params = self.kernel_params or {}
        params["gamma"] = self.gamma
        params["degree"] = self.degree
        params["coef0"] = self.coef0

        # indices for our exact / approximate blocks
        inds = np.arange(n)
        keep = rng.choice(inds, n_components, replace=False)
        keep.sort()
        rest = ~np.isin(inds, keep)

        # compute the exact blocks
        # these are done in parallel for dask arrays
        if isinstance(X, da.Array):
            X_keep = X[keep].rechunk(X.shape).persist()
        else:
            X_keep = X[keep]

        X_rest = X[rest]

        A, B = embed(X_keep, X_rest, n_components, metric, params)
        _log_array(logger, A, "A")
        _log_array(logger, B, "B")

        # now the approximation of C
        a = A.sum(0)  # (l,)
        b1 = B.sum(1)  # (l,)
        b2 = B.sum(0)  # (m,)

        # TODO: I think we have some unnecessary delayed wrapping of A here.
        A_inv = da.from_delayed(delayed(pinv)(A), A.shape, A.dtype)

        inner = A_inv.dot(b1)
        d1_si = 1 / da.sqrt(a + b1)

        d2_si = 1 / da.sqrt(b2 + B.T.dot(inner))  # (m,), dask array

        # d1, d2 are diagonal, so we can avoid large matrix multiplies
        # Equivalent to diag(d1_si) @ A @ diag(d1_si)
        A2 = d1_si.reshape(-1, 1) * A * d1_si.reshape(1, -1)  # (n, n)
        _log_array(logger, A2, "A2")
        # A2 = A2.rechunk(A2.shape)
        # Equivalent to diag(d1_si) @ B @ diag(d2_si)
        B2 = da.multiply(da.multiply(d1_si.reshape(-1, 1), B), d2_si.reshape(1, -1))
        _log_array(logger, B2, "B2")

        U_A, S_A, V_A = delayed(svd, pure=True, nout=3)(A2)

        U_A = da.from_delayed(U_A, (n_components, n_components), A2.dtype)
        S_A = da.from_delayed(S_A, (n_components,), A2.dtype)
        V_A = da.from_delayed(V_A, (n_components, n_components), A2.dtype)

        # Eq 16. This is OK when V2 is orthogonal
        V2 = da.sqrt(float(n_components) / n) * da.vstack([A2, B2.T]).dot(
            U_A[:, :n_clusters]
        ).dot(
            da.diag(1.0 / da.sqrt(S_A[:n_clusters]))
        )  # (n, k)
        _log_array(logger, V2, "V2.1")

        if isinstance(B2, da.Array):
            V2 = V2.rechunk((B2.chunks[1][0], n_clusters))
            _log_array(logger, V2, "V2.2")

        # normalize (Eq. 4)
        U2 = (V2.T / da.sqrt((V2 ** 2).sum(1))).T  # (n, k)

        _log_array(logger, U2, "U2.2")

        # Recover original indices
        U2 = _slice_mostly_sorted(U2, keep, rest, inds)  # (n, k)

        _log_array(logger, U2, "U2.3")

        if self.persist_embedding and isinstance(U2, da.Array):
            logger.info("Persisting array for k-means")
            U2 = U2.persist()
        elif isinstance(U2, da.Array):
            logger.info(
                "Consider persist_embedding. This will require %s",
                _format_bytes(U2.nbytes),
            )
            pass
        logger.info("k-means for assign_labels[starting]")
        km.fit(U2)
        logger.info("k-means for assign_labels[finished]")

        # Now... what to keep?
        self.assign_labels_ = km
        self.labels_ = km.labels_
        self.eigenvalues_ = S_A[:n_clusters]  # TODO: better name
        return self
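For orientation, a hedged end-to-end call of this estimator on a dask array; the sizes, chunking, and parameter values below are illustrative only:

import dask.array as da
from dask_ml.cluster import SpectralClustering

# 10,000 points in 3 dimensions, chunked for out-of-core processing
X = da.random.random((10_000, 3), chunks=(2_500, 3))

sc = SpectralClustering(n_clusters=4, n_components=200, persist_embedding=True)
sc.fit(X)

labels = sc.labels_   # assigned by the final k-means step shown above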
9.
industrial-optimization-group/DESDEO · misc.py
Match rating: 56.16%
def _centroids(n_clusters: int, points: List[List[float]]) -> List[List[float]]:
    """ Return n_clusters centroids of points
    """

    k_means = KMeans(n_clusters=n_clusters)
    k_means.fit(points)

    closest, _ = pairwise_distances_argmin_min(k_means.cluster_centers_, points)

    return list(map(list, np.array(points)[closest.tolist()]))
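A small usage sketch, assuming ``KMeans``, ``pairwise_distances_argmin_min``, and ``np`` are imported at module level as the snippet requires:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

points = np.random.random((50, 2)).tolist()

# three representative points, one closest to each cluster centroid
representatives = _centroids(3, points)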
10.
vmirly/pyclust · _kmeans.py
Match rating: 56.14%
def fit(self, X):
        """ Apply KMeans Clustering
              X: dataset with feature vectors
        """
        self.centers_, self.labels_, self.sse_arr_, self.n_iter_ = \
              _kmeans(X, self.n_clusters, self.max_iter, self.n_trials, self.tol)