industrial-optimization-group/DESDEO · misc.py
from typing import List

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min


def _centroids(n_clusters: int, points: List[List[float]]) -> List[List[float]]:
    """Return n_clusters centroids of points."""
    k_means = KMeans(n_clusters=n_clusters)
    k_means.fit(points)

    # Snap each k-means center to its nearest actual input point, so every
    # returned "centroid" is a member of `points`.
    closest, _ = pairwise_distances_argmin_min(k_means.cluster_centers_, points)

    return list(map(list, np.array(points)[closest.tolist()]))
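A minimal usage sketch (toy data for illustration; assumes scikit-learn and NumPy are installed):

# Two loose groups, around (0, 0) and (10, 10).
points = [[0.0, 0.1], [0.2, -0.1], [10.0, 10.2], [9.8, 9.9]]

# Prints one member of `points` per cluster; which member represents each
# cluster depends on the fitted centers.
print(_centroids(2, points))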
Similar code snippets
1.
oscarbranson/latools · classifier_obj.py
Match rating: 61.56%
def fit_kmeans(self, data, n_clusters, **kwargs):
        """
        Fit KMeans clustering algorithm to data.

        Parameters
        ----------
        data : array-like
            A dataset formatted by `classifier.fitting_data`.
        n_clusters : int
            The number of clusters in the data.
        **kwargs
            passed to `sklearn.cluster.KMeans`.

        Returns
        -------
        Fitted `sklearn.cluster.KMeans` object.
        """
        km = cl.KMeans(n_clusters=n_clusters, **kwargs)
        km.fit(data)
        return km
2.
oscarbranson/latools · clustering.py
Match rating: 58.68%
def cluster_kmeans(data, n_clusters, **kwargs):
    """
    Identify clusters using the K-Means algorithm.

    Parameters
    ----------
    data : array_like
        array of size [n_samples, n_features].
    n_clusters : int
        The number of clusters expected in the data.
    **kwargs
        passed to `sklearn.cluster.KMeans`.

    Returns
    -------
    tuple
        array of cluster labels for each sample, plus a [np.nan]
        placeholder.
    """
    km = cl.KMeans(n_clusters, **kwargs)
    kmf = km.fit(data)

    labels = kmf.labels_

    return labels, [np.nan]
3.
datamole-ai/active-semi-supervised-clustering · mpckmeans.py
Match rating: 58.14%
def _get_cluster_centers(self, X, labels):
        return np.array([X[labels == i].mean(axis=0) for i in range(self.n_clusters)])
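A small standalone illustration of the boolean-mask mean (hypothetical data; the original is a method that reads self.n_clusters):

import numpy as np

X = np.array([[0.0, 0.0], [1.0, 1.0], [10.0, 10.0]])
labels = np.array([0, 0, 1])
# Cluster 0 averages its two rows to [0.5, 0.5]; cluster 1 stays [10., 10.].
centers = np.array([X[labels == i].mean(axis=0) for i in range(2)])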
4.
vmirly/pyclust · _kmeans.py
Match rating: 57.97%
def _kmeans_init(X, n_clusters, method='balanced', rng=None):
    """ Initialize k=n_clusters centroids randomly
    """
    n_samples = X.shape[0]
    if rng is None:
        cent_idx = np.random.choice(n_samples, replace=False, size=n_clusters)
    else:
        cent_idx = rng.choice(n_samples, replace=False, size=n_clusters)

    centers = X[cent_idx, :]
    mean_X = np.mean(X, axis=0)

    if method == 'balanced':
        # Overwrite the last center so that the mean of all centers equals
        # the data mean: c_k = k * mean(X) - sum(c_1 .. c_{k-1}).
        centers[n_clusters - 1] = n_clusters * mean_X - np.sum(centers[:(n_clusters - 1)], axis=0)

    return centers
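A quick check of the 'balanced' property (hypothetical toy data with seeded generators; not from the original file):

import numpy as np

X = np.random.default_rng(0).normal(size=(100, 3))
centers = _kmeans_init(X, n_clusters=4, rng=np.random.default_rng(1))
# The mean of the centers matches the data mean, up to float error.
assert np.allclose(centers.mean(axis=0), X.mean(axis=0))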
5.
dask/dask-ml · kmeans_airline.py
Match rating: 57.33%
def do(X, n_clusters, factor):
    km = KMeans(n_clusters=n_clusters, oversampling_factor=factor)
    km.fit(X)
    return km
6.
ismms-himc/clustergrammer2 · downsample_fun.py
Match rating: 56.34%
def calc_mbk_clusters(X, n_clusters, random_state=1000):

  # k-means is run with rows as data points and columns as dimensions
  mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters,
                        max_no_improvement=100, verbose=0,
                        random_state=random_state)

  mbk.fit(X)
  cluster_data = mbk.labels_
  clusters = mbk.cluster_centers_

  # Count how many points received each label; this gives the population
  # size of each cluster. Empty clusters drop out of np.unique, so fewer
  # than n_clusters clusters may be returned.
  mbk_cluster_names, cluster_pop = np.unique(cluster_data, return_counts=True)

  num_returned_clusters = len(cluster_pop)

  return clusters, num_returned_clusters, cluster_data, cluster_pop
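A hedged usage sketch (toy data; assumes MiniBatchKMeans has been imported from sklearn.cluster, as the snippet expects):

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.default_rng(0).normal(size=(500, 10))
clusters, n_found, labels, pops = calc_mbk_clusters(X, n_clusters=5)
# pops[i] is the number of rows assigned to cluster i; n_found can be
# smaller than n_clusters if some clusters end up empty.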
7.
daler/metaseq · plotutils.py
Match rating: 55.04%
def new_clustered_sortind(x, k=10, row_key=None, cluster_key=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Each cluster of rows is then sorted by `scorefunc` -- by default, the max
    peak height when all rows in a cluster are averaged, or
    cluster.mean(axis=0).max().

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param row_key:
        Optional function to act as a sort key for sorting rows within
        clusters.  Signature should be `row_key(a)` where `a` is a 1-D NumPy
        array.
    :param cluster_key:
        Optional function for sorting clusters.  Signature is `clusterfunc(a)`
        where `a` is a NumPy array containing all rows of `x` for cluster `i`.
        It must return a single value.
    """
    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikit-learn for clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k

    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        # choose the k with the lowest mean distance from cluster centers
        best_k = sorted(mean_dists.items(), key=lambda x: x[1])[0][0]

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k
    labels = mbk.labels_
    scores = np.zeros(labels.shape, dtype=float)

    if cluster_key:
        # It's easier for calling code to provide something that operates on
        # a cluster level, but here it's converted to work on a label level
        # that looks in to the array `x`.
        def _cluster_key(i):
            return cluster_key(x[labels == i, :])
        sorted_labels = sorted(range(k), key=_cluster_key)
    else:
        # Otherwise just use them as-is.
        sorted_labels = range(k)

    if row_key:
        # Again, easier to provide a function to operate on a row.  But here we
        # need it to accept an index
        def _row_key(i):
            return row_key(x[i, :])

    final_ind = []
    breaks = []
    pos = 0
    for label in sorted_labels:
        # which rows in `x` have this label
        label_inds = np.nonzero(labels == label)[0]
        if row_key:
            label_sort_ind = sorted(label_inds, key=_row_key)
        else:
            label_sort_ind = label_inds
        for li in label_sort_ind:
            final_ind.append(li)
        pos += len(label_inds)
        breaks.append(pos)

    return np.array(final_ind), np.array(breaks)
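A hedged usage sketch (random matrix for illustration; assumes scikit-learn is installed):

import numpy as np

x = np.random.default_rng(0).normal(size=(200, 50))
ind, breaks = new_clustered_sortind(x, k=4, row_key=lambda row: row.max())
x_sorted = x[ind]  # rows grouped by cluster, sorted within each group
# After plotting x_sorted, draw an axhline at each value in `breaks` to
# mark the cluster boundaries.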
8.
ruipgil/TrackToTrip · spatiotemporal_segmentation.py
Match rating: 54.67%
def spatiotemporal_segmentation(points, eps, min_time):
    """ Splits a set of points into multiple sets of points based on
        spatio-temporal stays

    DBSCAN is used to predict possible segmentations; we then check
        whether each cluster is big enough in time (>= min_time). If so,
        the segmentation is considered valid.

    When segmenting, the last point of the ith segment will be the same
        as that of the (i-1)th segment.

    Segments are identified through clusters.
    The last point of a cluster that comes after a sub-segment A will
        also be present in sub-segment A.

    Args:
        points (:obj:`list` of :obj:`Point`): segment's points
        eps (float): Epsilon to feed to the DBSCAN algorithm.
            Maximum distance between two samples, to be considered in
            the same cluster.
        min_time (float): Minimum time of a stay
    Returns:
        :obj:`list` of :obj:`list` of :obj:`Point`: Initial set of
            points in different segments
    """
    # min time / sample rate
    dt_average = np.median([point.dt for point in points])
    min_samples = min_time / dt_average

    data = [point.gen3arr() for point in points]
    data = StandardScaler().fit_transform(data)
    print('min_samples: %f' % min_samples)
    db_cluster = DBSCAN(eps=eps, min_samples=min_samples).fit(data)
    labels = db_cluster.labels_

    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    segments = [[] for _ in range(n_clusters_+1)]
    clusters = [[] for _ in range(n_clusters_+1)]
    current_segment = 0

    print('clusters')
    print(n_clusters_)
    if n_clusters_ == 1:
        segments = temporal_segmentation([points], min_time)
        return [segment for segment in segments if len(segment) > 1]

    # split segments identified with dbscan
    for i, label in enumerate(labels):
        if label != -1 and label + 1 != current_segment:
            current_segment = label + 1
        point = points[i]
        if label == -1:
            segments[current_segment].append(point)
        else:
            clusters[label + 1].append(point)

    # fall back to the original points if no segment received any point
    if len(segments) == 0 or sum(len(s) for s in segments) == 0:
        segments = [points]

    segments = temporal_segmentation(segments, min_time)
    # segments = temporal_segmentation(correct_segmentation(segments, clusters, min_time), min_time)
    return [segment for segment in segments if len(segment) > 1]
9.
dask/dask-ml · k_means_kdd.py
Match rating: 54.64%
def fit(data, use_scikit_learn=False):
    logger.info("Starting to cluster")
    # Cluster
    n_clusters = 8
    oversampling_factor = 2
    if use_scikit_learn:
        km = sk.KMeans(n_clusters=n_clusters, random_state=0)
    else:
        km = KMeans(
            n_clusters=n_clusters,
            oversampling_factor=oversampling_factor,
            random_state=0,
        )
    logger.info(
        "Starting n_clusters=%2d, oversampling_factor=%2d",
        n_clusters,
        oversampling_factor,
    )
    with _timer("km.fit", _logger=logger):
        km.fit(data)
10.
szairis/sakmapper · network.py
Match rating: 54.62%
def optimal_clustering(df, patch, method='kmeans', statistic='gap', max_K=5):
    if len(patch) == 1:
        return [patch]

    if statistic == 'db':
        if method == 'kmeans':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                kmeans = cluster.KMeans(n_clusters=k).fit(X)
                clustering[k] = pd.DataFrame(kmeans.predict(X), index=patch)
                dist_mu = squareform(pdist(kmeans.cluster_centers_))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

        elif method == 'agglomerative':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                agglomerative = cluster.AgglomerativeClustering(n_clusters=k, linkage='average').fit(X)
                clustering[k] = pd.DataFrame(agglomerative.fit_predict(X), index=patch)
                tmp = [list(clustering[k][clustering[k][0] == i].index) for i in range(k)]
                centers = np.array([np.mean(X.loc[c, :], axis=0) for c in tmp])
                dist_mu = squareform(pdist(centers))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

    elif statistic == 'gap':
        X = np.array(df.loc[patch, :])
        if method == 'kmeans':
            f = cluster.KMeans
        gaps = gap(X, ks=range(1, min(max_K, len(patch))), method=f)
        k_optimal = list(gaps).index(max(gaps))+1
        clustering = pd.DataFrame(f(n_clusters=k_optimal).fit_predict(X), index=patch)
        return [list(clustering[clustering[0] == i].index) for i in range(k_optimal)]

    else:
        raise ValueError('only db and gap statistics are supported')
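A hedged usage sketch (assumes the module-level helpers `davies_bouldin` and `gap` from the same file, plus a pandas DataFrame `df`; `patch` here is a list of row labels):

groups = optimal_clustering(df, patch=list(df.index), method='kmeans',
                            statistic='db', max_K=5)
# Each element of `groups` lists the row labels of one cluster; with
# statistic='db', the k minimizing the Davies-Bouldin index is chosen.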