oscarbranson/latools · clustering.py
import numpy as np
import sklearn.cluster as cl

def cluster_kmeans(data, n_clusters, **kwargs):
    """
    Identify clusters using the K-Means algorithm.

    Parameters
    ----------
    data : array_like
        Array of size [n_samples, n_features].
    n_clusters : int
        The number of clusters expected in the data.
    **kwargs
        Passed to `sklearn.cluster.KMeans`.

    Returns
    -------
    tuple
        (labels, [np.nan]): `labels` is an integer array of length
        n_samples giving the cluster assigned to each sample.
    """
    km = cl.KMeans(n_clusters, **kwargs)
    kmf = km.fit(data)

    labels = kmf.labels_

    return labels, [np.nan]
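For orientation, a hedged usage sketch of the function above (the toy data and seed are illustrative, not from latools):

import numpy as np

# Hypothetical usage: two well-separated blobs, split into two clusters.
rng = np.random.default_rng(0)
data = np.vstack([rng.normal(0, 0.5, (50, 2)),   # blob near (0, 0)
                  rng.normal(5, 0.5, (50, 2))])  # blob near (5, 5)

labels, _ = cluster_kmeans(data, n_clusters=2, random_state=0)
mask = labels == 0                 # boolean mask selecting one cluster
print(mask.sum(), (~mask).sum())   # roughly 50 and 50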
Similar code snippets
1.
oscarbranson/latools · classifier_obj.py
Match rating: 77.42%
def fit_kmeans(self, data, n_clusters, **kwargs):
        """
        Fit KMeans clustering algorithm to data.

        Parameters
        ----------
        data : array-like
            A dataset formatted by `classifier.fitting_data`.
        n_clusters : int
            The number of clusters in the data.
        **kwargs
            passed to `sklearn.cluster.KMeans`.

        Returns
        -------
        Fitted `sklearn.cluster.KMeans` object.
        """
        km = cl.KMeans(n_clusters=n_clusters, **kwargs)
        km.fit(data)
        return km
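The value of returning the fitted estimator, rather than just its labels, is that it can classify unseen samples later. A standalone sketch of that pattern in plain scikit-learn (not latools' classifier wrapper):

import numpy as np
import sklearn.cluster as cl

X = np.random.default_rng(1).normal(size=(60, 3))
km = cl.KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
print(km.predict(X[:5]))          # cluster labels for the first five samples
print(km.cluster_centers_.shape)  # (3, 3)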
2.
dask/dask-ml · kmeans_airline.py
Match rating: 66.62%
def do(X, n_clusters, factor):
    km = KMeans(n_clusters=n_clusters, oversampling_factor=factor)
    km.fit(X)
    return km
3.
ismms-himc/clustergrammer2 · downsample_fun.py
Match rating: 63.07%
def calc_mbk_clusters(X, n_clusters, random_state=1000):

  # kmeans is run with rows as data-points and columns as dimensions
  mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters,
                         max_no_improvement=100, verbose=0,
                         random_state=random_state)

  # count how many points were assigned each label (each k-means cluster);
  # this gives the population size of each cluster
  mbk.fit(X)
  cluster_data = mbk.labels_
  clusters = mbk.cluster_centers_

  mbk_cluster_names, cluster_pop = np.unique(cluster_data, return_counts=True)

  num_returned_clusters = len(cluster_pop)

  return clusters, num_returned_clusters, cluster_data, cluster_pop
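The population count at the end is a single `np.unique` call with `return_counts=True`; a tiny self-contained illustration:

import numpy as np

labels = np.array([0, 2, 0, 1, 2, 2])
names, pop = np.unique(labels, return_counts=True)
print(names)  # [0 1 2]
print(pop)    # [2 1 3] -- population size of each cluster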
4.
datamole-ai/active-semi-supervised-clustering · mpckmeans.py
Match rating: 61.88%
def _get_cluster_centers(self, X, labels):
        return np.array([X[labels == i].mean(axis=0) for i in range(self.n_clusters)])
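As a sanity check on this boolean-mask idiom: at full convergence, the per-label means coincide with the centers scikit-learn itself reports (a minimal sketch, not part of the library):

import numpy as np
from sklearn.cluster import KMeans

X = np.random.default_rng(2).normal(size=(40, 2))
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
manual = np.array([X[km.labels_ == i].mean(axis=0) for i in range(2)])
print(np.allclose(manual, km.cluster_centers_))  # True once fully converged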
5.
dask/dask-ml · k_means_kdd.py
Match rating: 60.31%
def fit(data, use_scikit_learn=False):
    logger.info("Starting to cluster")
    # Cluster
    n_clusters = 8
    oversampling_factor = 2
    if use_scikit_learn:
        km = sk.KMeans(n_clusters=n_clusters, random_state=0)
    else:
        km = KMeans(
            n_clusters=n_clusters,
            oversampling_factor=oversampling_factor,
            random_state=0,
        )
    logger.info(
        "Starting n_clusters=%2d, oversampling_factor=%2d",
        n_clusters,
        oversampling_factor,
    )
    with _timer("km.fit", _logger=logger):
        km.fit(data)
6.
slundberg/shap · kernel.py
Match rating: 60.03%
def kmeans(X, k, round_values=True):
    """ Summarize a dataset with k mean samples weighted by the number of data points they
    each represent.

    Parameters
    ----------
    X : numpy.array or pandas.DataFrame
        Matrix of data samples to summarize (# samples x # features)

    k : int
        Number of means to use for approximation.

    round_values : bool
        For all i, round the ith dimension of each mean sample to match the nearest value
        from X[:,i]. This ensures discrete features always get a valid value.

    Returns
    -------
    DenseData object.
    """

    group_names = [str(i) for i in range(X.shape[1])]
    if str(type(X)).endswith("'pandas.core.frame.DataFrame'>"):
        group_names = X.columns
        X = X.values
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)

    if round_values:
        for i in range(k):
            for j in range(X.shape[1]):
                ind = np.argmin(np.abs(X[:,j] - kmeans.cluster_centers_[i,j]))
                kmeans.cluster_centers_[i,j] = X[ind,j]
    return DenseData(kmeans.cluster_centers_, group_names, None, 1.0*np.bincount(kmeans.labels_))
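The `round_values` loop snaps each center coordinate to the nearest value actually observed in that column, so discrete features keep valid levels. The same argmin trick in isolation:

import numpy as np

col = np.array([0.0, 1.0, 2.0, 3.0])   # observed values of one feature
center = 1.7                           # a raw k-means center coordinate
snapped = col[np.argmin(np.abs(col - center))]
print(snapped)  # 2.0 -- nearest observed value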
7.
brainiak/brainiak · tfa.py
Match rating: 59.87%
def init_centers_widths(self, R):
        """Initialize prior of centers and widths

        Returns
        -------

        centers : 2D array, with shape [K, n_dim]
            Prior of factors' centers.

        widths : 1D array, with shape [K, 1]
            Prior of factors' widths.

        """

        kmeans = KMeans(
            init='k-means++',
            n_clusters=self.K,
            n_init=10,
            random_state=100)
        kmeans.fit(R)
        centers = kmeans.cluster_centers_
        widths = self._get_max_sigma(R) * np.ones((self.K, 1))
        return centers, widths
8.
daler/metaseq · plotutils.py
Match rating: 59.39%
def new_clustered_sortind(x, k=10, row_key=None, cluster_key=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Rows within each cluster can then be sorted by `row_key`, and the
    clusters themselves ordered by `cluster_key`.

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param row_key:
        Optional function to act as a sort key for sorting rows within
        clusters.  Signature should be `row_key(a)` where `a` is a 1-D NumPy
        array.
    :param cluster_key:
        Optional function for sorting clusters.  Signature is `cluster_key(a)`
        where `a` is a NumPy array containing all rows of `x` for cluster `i`.
        It must return a single value.
    """
    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikit-learn for '
                          'clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k

    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        # choose the k with the lowest mean distance, as documented above
        best_k = min(mean_dists.items(), key=lambda kv: kv[1])[0]

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k
    labels = mbk.labels_

    if cluster_key:
        # It's easier for calling code to provide something that operates on
        # a cluster level, but here it's converted to work on a label level
        # that looks in to the array `x`.
        def _cluster_key(i):
            return cluster_key(x[labels == i, :])
        sorted_labels = sorted(range(k), key=_cluster_key)
    else:
        # Otherwise just use them as-is.
        sorted_labels = range(k)

    if row_key:
        # Again, easier to provide a function to operate on a row.  But here we
        # need it to accept an index
        def _row_key(i):
            return row_key(x[i, :])

    final_ind = []
    breaks = []
    pos = 0
    for label in sorted_labels:
        # which rows in `x` have this label
        label_inds = np.nonzero(labels == label)[0]
        if row_key:
            label_sort_ind = sorted(label_inds, key=_row_key)
        else:
            label_sort_ind = label_inds
        for li in label_sort_ind:
            final_ind.append(li)
        pos += len(label_inds)
        breaks.append(pos)

    return np.array(final_ind), np.array(breaks)
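A hedged usage sketch, following the docstring's suggestion to mark cluster boundaries with `axhline` (matplotlib assumed; the data is random filler):

import numpy as np
import matplotlib.pyplot as plt

x = np.random.default_rng(3).normal(size=(200, 50))
ind, breaks = new_clustered_sortind(x, k=4)

plt.imshow(x[ind], aspect='auto', interpolation='nearest')
for b in breaks[:-1]:
    plt.axhline(b - 0.5, color='k')   # boundary between adjacent clusters
plt.show()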
9.
dask/dask-ml · spectral.py
Match rating: 58.84%
def fit(self, X, y=None):
        X = self._check_array(X)
        n_components = self.n_components
        metric = self.affinity
        rng = check_random_state(self.random_state)
        n_clusters = self.n_clusters

        # kmeans for final clustering
        if isinstance(self.assign_labels, six.string_types):
            if self.assign_labels == "kmeans":
                km = KMeans(
                    n_clusters=n_clusters,
                    random_state=draw_seed(rng, np.iinfo("i4").max, dtype="uint"),
                )
            elif self.assign_labels == "sklearn-kmeans":
                km = sklearn.cluster.KMeans(n_clusters=n_clusters, random_state=rng)
            else:
                msg = "Unknown 'assign_labels' {!r}".format(self.assign_labels)
                raise ValueError(msg)
        elif isinstance(self.assign_labels, BaseEstimator):
            km = self.assign_labels
        else:
            raise TypeError(
                "Invalid type {} for 'assign_labels'".format(type(self.assign_labels))
            )

        if self.kmeans_params:
            km.set_params(**self.kmeans_params)

        n = len(X)
        if n <= n_components:
            msg = (
                "'n_components' must be smaller than the number of samples."
                " Got {} components and {} samples".format(n_components, n)
            )
            raise ValueError(msg)

        params = self.kernel_params or {}
        params["gamma"] = self.gamma
        params["degree"] = self.degree
        params["coef0"] = self.coef0

        # indices for our exact / approximate blocks
        inds = np.arange(n)
        keep = rng.choice(inds, n_components, replace=False)
        keep.sort()
        rest = ~np.isin(inds, keep)

        # compute the exact blocks
        # these are done in parallel for dask arrays
        if isinstance(X, da.Array):
            X_keep = X[keep].rechunk(X.shape).persist()
        else:
            X_keep = X[keep]

        X_rest = X[rest]

        A, B = embed(X_keep, X_rest, n_components, metric, params)
        _log_array(logger, A, "A")
        _log_array(logger, B, "B")

        # now the approximation of C
        a = A.sum(0)  # (l,)
        b1 = B.sum(1)  # (l,)
        b2 = B.sum(0)  # (m,)

        # TODO: I think we have some unnecessary delayed wrapping of A here.
        A_inv = da.from_delayed(delayed(pinv)(A), A.shape, A.dtype)

        inner = A_inv.dot(b1)
        d1_si = 1 / da.sqrt(a + b1)

        d2_si = 1 / da.sqrt(b2 + B.T.dot(inner))  # (m,), dask array

        # d1, d2 are diagonal, so we can avoid large matrix multiplies
        # Equivalent to diag(d1_si) @ A @ diag(d1_si)
        A2 = d1_si.reshape(-1, 1) * A * d1_si.reshape(1, -1)  # (n, n)
        _log_array(logger, A2, "A2")
        # A2 = A2.rechunk(A2.shape)
        # Equivalent to diag(d1_si) @ B @ diag(d2_si)
        B2 = da.multiply(da.multiply(d1_si.reshape(-1, 1), B), d2_si.reshape(1, -1))
        _log_array(logger, B2, "B2")

        U_A, S_A, V_A = delayed(svd, pure=True, nout=3)(A2)

        U_A = da.from_delayed(U_A, (n_components, n_components), A2.dtype)
        S_A = da.from_delayed(S_A, (n_components,), A2.dtype)
        V_A = da.from_delayed(V_A, (n_components, n_components), A2.dtype)

        # Eq 16. This is OK when V2 is orthogonal
        V2 = da.sqrt(float(n_components) / n) * da.vstack([A2, B2.T]).dot(
            U_A[:, :n_clusters]
        ).dot(
            da.diag(1.0 / da.sqrt(S_A[:n_clusters]))
        )  # (n, k)
        _log_array(logger, V2, "V2.1")

        if isinstance(B2, da.Array):
            V2 = V2.rechunk((B2.chunks[1][0], n_clusters))
            _log_array(logger, V2, "V2.2")

        # normalize (Eq. 4)
        U2 = (V2.T / da.sqrt((V2 ** 2).sum(1))).T  # (n, k)

        _log_array(logger, U2, "U2.2")

        # Recover original indices
        U2 = _slice_mostly_sorted(U2, keep, rest, inds)  # (n, k)

        _log_array(logger, U2, "U2.3")

        if self.persist_embedding and isinstance(U2, da.Array):
            logger.info("Persisting array for k-means")
            U2 = U2.persist()
        elif isinstance(U2, da.Array):
            logger.info(
                "Consider persist_embedding. This will require %s",
                _format_bytes(U2.nbytes),
            )
            pass
        logger.info("k-means for assign_labels[starting]")
        km.fit(U2)
        logger.info("k-means for assign_labels[finished]")

        # Now... what to keep?
        self.assign_labels_ = km
        self.labels_ = km.labels_
        self.eigenvalues_ = S_A[:n_clusters]  # TODO: better name
        return self
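One step worth unpacking: because `d1_si` and `d2_si` are diagonals, the code scales rows and columns by broadcasting instead of materializing diagonal matrices. A small NumPy check of the equivalence the inline comments claim:

import numpy as np

rng = np.random.default_rng(4)
A = rng.normal(size=(5, 5))
d = rng.random(5)

dense = np.diag(d) @ A @ np.diag(d)                  # explicit diagonal matrices
broadcast = d.reshape(-1, 1) * A * d.reshape(1, -1)  # row/column scaling
print(np.allclose(dense, broadcast))                 # True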
10.
daler/metaseq · plotutils.py
Match rating: 58.78%
def clustered_sortind(x, k=10, scorefunc=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Each cluster of rows is then sorted by `scorefunc` -- by default, the max
    peak height when all rows in a cluster are averaged, or
    cluster.mean(axis=0).max().

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param scorefunc: Optional function for sorting rows within clusters.  Must
        accept a single argument of a NumPy array.
    """
    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikit-learn for '
                          'clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k

    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        # choose the k with the lowest mean distance, as documented above
        best_k = min(mean_dists.items(), key=lambda kv: kv[1])[0]

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k
    labels = mbk.labels_
    scores = np.zeros(labels.shape, dtype=float)

    if not scorefunc:
        def scorefunc(x):
            return x.mean(axis=0).max()

    for label in range(k):
        ind = labels == label
        score = scorefunc(x[ind, :])
        scores[ind] = score

    pos = 0
    breaks = []
    ind = np.argsort(scores)
    for k, g in itertools.groupby(labels[ind]):
        pos += len(list(g))
        breaks.append(pos)

    return ind, breaks
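The `breaks` bookkeeping at the end is just a cumulative count over runs of equal labels; the `itertools.groupby` pattern in isolation:

import itertools
import numpy as np

sorted_labels = np.array([0, 0, 0, 2, 2, 1])  # labels after argsort by score
pos, breaks = 0, []
for _, group in itertools.groupby(sorted_labels):
    pos += len(list(group))
    breaks.append(pos)
print(breaks)  # [3, 5, 6] -- cumulative row counts at cluster boundaries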