brainiak/brainiak · tfa.py
def init_centers_widths(self, R):
        """Initialize prior of centers and widths

        Returns
        -------

        centers : 2D array, with shape [K, n_dim]
            Prior of factors' centers.

        widths : 2D array, with shape [K, 1]
            Prior of factors' widths.

        """

        kmeans = KMeans(
            init='k-means++',
            n_clusters=self.K,
            n_init=10,
            random_state=100)
        kmeans.fit(R)
        centers = kmeans.cluster_centers_
        widths = self._get_max_sigma(R) * np.ones((self.K, 1))
        return centers, widths
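For context, the initialization above can be reproduced outside the class. A minimal standalone sketch, assuming synthetic 3-D voxel coordinates for `R`, `K = 5` factors, and a simple data-extent heuristic standing in for `self._get_max_sigma(R)`:

import numpy as np
from sklearn.cluster import KMeans

K = 5
R = np.random.rand(1000, 3)  # [n_voxel, n_dim] voxel coordinate matrix

kmeans = KMeans(init='k-means++', n_clusters=K, n_init=10, random_state=100)
kmeans.fit(R)
centers = kmeans.cluster_centers_                  # [K, n_dim] center priors
max_sigma = np.max(R.max(axis=0) - R.min(axis=0))  # stand-in width bound
widths = max_sigma * np.ones((K, 1))               # [K, 1] width priors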
Similar code snippets
1.
jasonlaska/spherecluster · von_mises_fisher_mixture.py
Match rating: 63.2%
def _init_unit_centers(X, n_clusters, random_state, init):
    """Initializes unit norm centers.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    init:  (string) one of
        k-means++ : uses sklearn k-means++ initialization algorithm
        spherical-k-means : use centroids from one pass of spherical k-means
        random : random unit norm vectors
        random-orthonormal : random orthonormal vectors
        random-class : random class assignment & centroid computation
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.
    """
    n_examples, n_features = np.shape(X)
    if isinstance(init, np.ndarray):
        n_init_clusters, n_init_features = init.shape
        assert n_init_clusters == n_clusters
        assert n_init_features == n_features

        # ensure unit normed centers
        centers = init
        for cc in range(n_clusters):
            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])

        return centers

    elif init == "spherical-k-means":
        labels, inertia, centers, iters = spherical_kmeans._spherical_kmeans_single_lloyd(
            X, n_clusters, x_squared_norms=np.ones((n_examples,)), init="k-means++"
        )

        return centers

    elif init == "random":
        centers = np.random.randn(n_clusters, n_features)
        for cc in range(n_clusters):
            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])

        return centers

    elif init == "k-means++":
        centers = _init_centroids(
            X,
            n_clusters,
            "k-means++",
            random_state=random_state,
            x_squared_norms=np.ones((n_examples,)),
        )

        for cc in range(n_clusters):
            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])

        return centers

    elif init == "random-orthonormal":
        centers = np.random.randn(n_clusters, n_features)
        q, r = np.linalg.qr(centers.T, mode="reduced")

        return q.T

    elif init == "random-class":
        centers = np.zeros((n_clusters, n_features))
        for cc in range(n_clusters):
            while np.linalg.norm(centers[cc, :]) == 0:
                labels = np.random.randint(0, n_clusters, n_examples)
                centers[cc, :] = X[labels == cc, :].sum(axis=0)

        for cc in range(n_clusters):
            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])

        return centers
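The `k-means++` branch above relies on the private scikit-learn helper `_init_centroids`; the same effect is available through the public `kmeans_plusplus` function (scikit-learn >= 0.24). A hedged sketch of that branch, seeding with k-means++ and projecting each center back onto the unit sphere:

import numpy as np
from sklearn.cluster import kmeans_plusplus

X = np.random.randn(500, 16)
X /= np.linalg.norm(X, axis=1, keepdims=True)  # unit-norm samples

centers, _ = kmeans_plusplus(X, n_clusters=8, random_state=0)
centers /= np.linalg.norm(centers, axis=1, keepdims=True)  # project to sphere
assert np.allclose(np.linalg.norm(centers, axis=1), 1.0)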
2.
annoviko/pyclustering · kmeans_segmentation.py
Match rating: 60.65%
def template_segmentation_image_amount_colors(source, amount):
    data = read_image(source)

    centers = kmeans_plusplus_initializer(data, amount, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
    kmeans_instance = kmeans(data, centers)
    kmeans_instance.process()

    clusters = kmeans_instance.get_clusters()
    draw_image_mask_segments(source, clusters)
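The same amount-of-colors segmentation can be sketched with scikit-learn in place of pyclustering; `image` below is a hypothetical [H, W, 3] array standing in for `read_image(source)`:

import numpy as np
from sklearn.cluster import KMeans

image = np.random.randint(0, 256, size=(64, 64, 3), dtype=np.uint8)
pixels = image.reshape(-1, 3).astype(float)

km = KMeans(n_clusters=8, n_init=10, random_state=0).fit(pixels)
# map every pixel to its cluster center: an 8-color version of the image
quantized = km.cluster_centers_[km.labels_].reshape(image.shape)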
3.
oscarbranson/latools · clustering.py
Match rating: 59.87%
def cluster_kmeans(data, n_clusters, **kwargs):
    """
    Identify clusters using the K-Means algorithm.

    Parameters
    ----------
    data : array_like
        array of size [n_samples, n_features].
    n_clusters : int
        The number of clusters expected in the data.

    Returns
    -------
    labels : array
        Cluster label for each sample.
    list
        Single-element placeholder list ([np.nan]).
    """
    km = cl.KMeans(n_clusters, **kwargs)
    kmf = km.fit(data)

    labels = kmf.labels_

    return labels, [np.nan]
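A caller can recover per-cluster boolean masks from the returned labels. A minimal sketch, assuming the same `sklearn.cluster` alias `cl` and a hypothetical feature matrix:

import numpy as np
import sklearn.cluster as cl

data = np.random.rand(200, 4)  # hypothetical [n_samples, n_features] matrix
labels = cl.KMeans(n_clusters=3, n_init=10).fit(data).labels_

# boolean mask per identified cluster, keyed by cluster label
masks = {i: labels == i for i in np.unique(labels)}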
4.
dragnet-org/dragnet · weninger.py
Match rating: 59.45%
def fit(self, blocks, y=None):
        """
        Fit a k-means clustering model using an ordered sequence of blocks.
        """
        self.kmeans.fit(make_weninger_features(blocks))
        # set the cluster center closest to the origin to exactly (0.0, 0.0)
        self.kmeans.cluster_centers_.sort(axis=0)
        self.kmeans.cluster_centers_[0, :] = np.zeros(2)
        return self
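The recentring trick above, standalone: after fitting, each coordinate column of the centers is sorted in place and the lowest center is pinned to the origin so "empty" blocks snap to an exact (0.0, 0.0) prototype. A sketch with a synthetic stand-in for `make_weninger_features(blocks)`:

import numpy as np
from sklearn.cluster import KMeans

features = np.abs(np.random.randn(300, 2))  # stand-in for Weninger features
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(features)

km.cluster_centers_.sort(axis=0)         # sort each coordinate column
km.cluster_centers_[0, :] = np.zeros(2)  # pin the lowest center to the origin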
5.
oscarbranson/latools · classifier_obj.py
Match rating: 58.92%
def fit_kmeans(self, data, n_clusters, **kwargs):
        """
        Fit KMeans clustering algorithm to data.

        Parameters
        ----------
        data : array-like
            A dataset formatted by `classifier.fitting_data`.
        n_clusters : int
            The number of clusters in the data.
        **kwargs
            passed to `sklearn.cluster.KMeans`.

        Returns
        -------
        Fitted `sklearn.cluster.KMeans` object.
        """
        km = cl.KMeans(n_clusters=n_clusters, **kwargs)
        km.fit(data)
        return km
6.
markovmodel/PyEMMA · api.py
Match rating: 58.69%
def cluster_mini_batch_kmeans(data=None, k=100, max_iter=10, batch_size=0.2, metric='euclidean',
                              init_strategy='kmeans++', n_jobs=None, chunksize=None, skip=0, clustercenters=None, **kwargs):
    r"""k-means clustering with mini-batch strategy

    Mini-batch k-means is an approximation to k-means which picks a randomly
    selected subset of data points to be updated in each iteration. Usually
    much faster than k-means but will likely deliver a less optimal result.

    Returns
    -------
    kmeans_mini : a :class:`MiniBatchKmeansClustering <pyemma.coordinates.clustering.MiniBatchKmeansClustering>` clustering object
        Object for mini-batch kmeans clustering.
        It holds discrete trajectories and cluster center information.

    See also
    --------
    :func:`kmeans <pyemma.coordinates.kmeans>` : for full k-means clustering


    .. autoclass:: pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering
        :members:
        :undoc-members:

        .. rubric:: Methods

        .. autoautosummary:: pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering
           :methods:

        .. rubric:: Attributes

        .. autoautosummary:: pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering
            :attributes:

    References
    ----------
    .. [1] http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf

    """
    from pyemma.coordinates.clustering.kmeans import MiniBatchKmeansClustering
    res = MiniBatchKmeansClustering(n_clusters=k, max_iter=max_iter, metric=metric, init_strategy=init_strategy,
                                    batch_size=batch_size, n_jobs=n_jobs, skip=skip, clustercenters=clustercenters)
    from pyemma.util.reflection import get_default_args
    cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_mini_batch_kmeans)['chunksize'], **kwargs)
    if data is not None:
        res.estimate(data, chunksize=cs)
    else:
        res.chunksize = chunksize
    return res
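A hypothetical call, grounded only in the signature shown above and assuming the function is exposed as `pyemma.coordinates.cluster_mini_batch_kmeans`:

import numpy as np
from pyemma.coordinates import cluster_mini_batch_kmeans

data = np.random.randn(10000, 3).astype(np.float32)  # hypothetical trajectory
# k=50 centers, mini-batches of 20% of the data per iteration
kmeans_mini = cluster_mini_batch_kmeans(data, k=50, max_iter=10, batch_size=0.2)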
7.
vmirly/pyclust · _kmeans.py
Match rating: 58.63%
def _kmeans_init(X, n_clusters, method='balanced', rng=None):
    """ Initialize k=n_clusters centroids randomly
    """
    n_samples = X.shape[0]
    if rng is None:
        cent_idx = np.random.choice(n_samples, replace=False, size=n_clusters)
    else:
        cent_idx = rng.choice(n_samples, replace=False, size=n_clusters)

    centers = X[cent_idx, :]
    mean_X = np.mean(X, axis=0)

    if method == 'balanced':
        # replace the last centroid so the centers' mean equals the data mean
        centers[n_clusters-1] = n_clusters*mean_X - np.sum(centers[:(n_clusters-1)], axis=0)

    return centers
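The 'balanced' branch replaces the last centroid so the centers sum to `n_clusters * mean_X`, which forces the mean of the centers to coincide with the data mean exactly. A quick check of that identity:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))
k = 5

centers = X[rng.choice(100, size=k, replace=False)].copy()
centers[k - 1] = k * X.mean(axis=0) - centers[:k - 1].sum(axis=0)

# the centers' mean now matches the data mean
assert np.allclose(centers.mean(axis=0), X.mean(axis=0))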
8.
daler/metaseq · plotutils.py
Match rating: 58.49%
def new_clustered_sortind(x, k=10, row_key=None, cluster_key=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Each cluster of rows can then be sorted by `row_key` -- for example, the
    max peak height when all rows in a cluster are averaged, i.e.
    cluster.mean(axis=0).max().

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param row_key:
        Optional function to act as a sort key for sorting rows within
        clusters.  Signature should be `row_key(a)` where `a` is a 1-D NumPy
        array.
    :param cluster_key:
        Optional function for sorting clusters.  Signature is `cluster_key(a)`
        where `a` is a NumPy array containing all rows of `x` for cluster `i`.
        It must return a single value.
    """
    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikit-learn for '
                          'clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k

    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        # pick the k with the lowest mean distance from cluster centers
        best_k = sorted(mean_dists.items(), key=lambda x: x[1])[0][0]

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k
    labels = mbk.labels_
    scores = np.zeros(labels.shape, dtype=float)

    if cluster_key:
        # It's easier for calling code to provide something that operates on
        # a cluster level, but here it's converted to work on a label level
        # that looks in to the array `x`.
        def _cluster_key(i):
            return cluster_key(x[labels == i, :])
        sorted_labels = sorted(range(k), key=_cluster_key)
    else:
        # Otherwise just use them as-is.
        sorted_labels = range(k)

    if row_key:
        # Again, easier to provide a function to operate on a row.  But here we
        # need it to accept an index
        def _row_key(i):
            return row_key(x[i, :])

    final_ind = []
    breaks = []
    pos = 0
    for label in sorted_labels:
        # which rows in `x` have this label
        label_inds = np.nonzero(labels == label)[0]
        if row_key:
            label_sort_ind = sorted(label_inds, key=_row_key)
        else:
            label_sort_ind = label_inds
        for li in label_sort_ind:
            final_ind.append(li)
        pos += len(label_inds)
        breaks.append(pos)

    return np.array(final_ind), np.array(breaks)
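Following the docstring, a hedged plotting sketch: reorder the rows of `x` with the returned index and draw each cluster boundary with `axhline`. Assumes `new_clustered_sortind` from above is in scope:

import numpy as np
import matplotlib.pyplot as plt

x = np.random.rand(500, 50)            # hypothetical matrix of rows
ind, breaks = new_clustered_sortind(x, k=5)

fig, ax = plt.subplots()
ax.imshow(x[ind], aspect='auto', interpolation='nearest')
for b in breaks[:-1]:
    ax.axhline(b, color='k')           # cluster boundary
plt.show()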
9.
jasonlaska/spherecluster · von_mises_fisher_mixture.py
Match rating: 57.93%
def _movMF(
    X,
    n_clusters,
    posterior_type="soft",
    force_weights=None,
    max_iter=300,
    verbose=False,
    init="random-class",
    random_state=None,
    tol=1e-6,
):
    """Mixture of von Mises Fisher clustering.

    Implements the algorithms (i) and (ii) from

      "Clustering on the Unit Hypersphere using von Mises-Fisher Distributions"
      by Banerjee, Dhillon, Ghosh, and Sra.

    TODO: Currently only supports Banerjee et al 2005 approximation of kappa,
          however, there are numerous other approximations see _update_params.

    Attribution
    -----------
    Approximation of log-vmf distribution function from movMF R-package.

    movMF: An R Package for Fitting Mixtures of von Mises-Fisher Distributions
    by Kurt Hornik, Bettina Grun, 2014

    Find more at:
      https://cran.r-project.org/web/packages/movMF/vignettes/movMF.pdf
      https://cran.r-project.org/web/packages/movMF/index.html

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    posterior_type: 'soft' or 'hard'
        Type of posterior computed in expectation step.
        See note about attribute: self.posterior_

    force_weights : None or array [n_clusters, ]
        If None, the algorithm will estimate the weights.
        If an array of weights, algorithm will estimate concentrations and
        centers with given weights.

    max_iter : int, default: 300
        Maximum number of iterations of the k-means algorithm for a
        single run.

    init:  (string) one of
        random-class [default]: random class assignment & centroid computation
        k-means++ : uses sklearn k-means++ initialization algorithm
        spherical-k-means : use centroids from one pass of spherical k-means
        random : random unit norm vectors
        random-orthonormal : random orthonormal vectors
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.

    tol : float, default: 1e-6
        Relative tolerance with regards to inertia to declare convergence

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    verbose : int, default 0
        Verbosity mode.

    """
    random_state = check_random_state(random_state)
    n_examples, n_features = np.shape(X)

    # init centers (mus)
    centers = _init_unit_centers(X, n_clusters, random_state, init)

    # init weights (alphas)
    if force_weights is None:
        weights = np.ones((n_clusters,))
        weights = weights / np.sum(weights)
    else:
        weights = force_weights

    # init concentrations (kappas)
    concentrations = np.ones((n_clusters,))

    if verbose:
        print("Initialization complete")

    for iter in range(max_iter):
        centers_prev = centers.copy()

        # expectation step
        posterior = _expectation(
            X, centers, weights, concentrations, posterior_type=posterior_type
        )

        # maximization step
        centers, weights, concentrations = _maximization(
            X, posterior, force_weights=force_weights
        )

        # check convergence
        tolcheck = squared_norm(centers_prev - centers)
        if tolcheck <= tol:
            if verbose:
                print(
                    "Converged at iteration %d: "
                    "center shift %e within tolerance %e" % (iter, tolcheck, tol)
                )
            break

    # labels come for free via posterior
    labels = np.zeros((n_examples,))
    for ee in range(n_examples):
        labels[ee] = np.argmax(posterior[:, ee])

    inertia = _inertia_from_labels(X, centers, labels)

    return centers, weights, concentrations, posterior, labels, inertia
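A minimal hedged driver for `_movMF`, assuming the function and its helpers above are importable: vMF mixtures expect unit-norm inputs, so random data is projected onto the sphere first.

import numpy as np

X = np.random.randn(300, 8)
X /= np.linalg.norm(X, axis=1, keepdims=True)  # project onto the unit sphere

centers, weights, concentrations, posterior, labels, inertia = _movMF(
    X, n_clusters=4, posterior_type="soft", random_state=0
)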
10.
dask/dask-ml · k_means.py
Match rating: 57.61%
def k_means(
    X,
    n_clusters,
    init="k-means||",
    precompute_distances="auto",
    n_init=1,
    max_iter=300,
    verbose=False,
    tol=1e-4,
    random_state=None,
    copy_x=True,
    n_jobs=-1,
    algorithm="full",
    return_n_iter=False,
    oversampling_factor=2,
    init_max_iter=None,
):
    """K-means algorithm for clustering

    Differences from scikit-learn:

    * init='k-means||'
    * oversampling_factor keyword
    * n_jobs=-1
    """
    labels, inertia, centers, n_iter = _kmeans_single_lloyd(
        X,
        n_clusters,
        max_iter=max_iter,
        init=init,
        verbose=verbose,
        tol=tol,
        random_state=random_state,
        oversampling_factor=oversampling_factor,
        init_max_iter=init_max_iter,
    )
    if return_n_iter:
        return labels, centers, inertia, n_iter
    else:
        return labels, centers, inertia
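A hypothetical call on a chunked dask array, using only the signature shown; the import path assumes the module above is importable as `dask_ml.cluster.k_means`:

import dask.array as da
from dask_ml.cluster.k_means import k_means  # assumed module path

X = da.random.random((100_000, 10), chunks=(10_000, 10))
labels, centers, inertia = k_means(X, n_clusters=8, random_state=0)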