markovmodel/PyEMMA · api.py
def cluster_mini_batch_kmeans(data=None, k=100, max_iter=10, batch_size=0.2, metric='euclidean',
                              init_strategy='kmeans++', n_jobs=None, chunksize=None, skip=0, clustercenters=None, **kwargs):
    r"""k-means clustering with mini-batch strategy

    Mini-batch k-means is an approximation to k-means which updates the
    cluster centers in each iteration using only a randomly selected subset
    of the data points. It is usually much faster than k-means, but will
    likely deliver a less optimal result.

    Returns
    -------
    kmeans_mini : a :class:`MiniBatchKmeansClustering <pyemma.coordinates.clustering.MiniBatchKmeansClustering>` clustering object
        Object for mini-batch kmeans clustering.
        It holds discrete trajectories and cluster center information.

    See also
    --------
    :func:`kmeans <pyemma.coordinates.kmeans>` : for full k-means clustering
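
    Examples
    --------
    A minimal usage sketch, mirroring the :func:`cluster_kmeans` example
    (random data stands in for real trajectories):

    >>> import numpy as np
    >>> from pyemma.util.contexts import settings
    >>> import pyemma.coordinates as coor
    >>> traj_data = [np.random.random((100, 3)), np.random.random((100, 3))]
    >>> with settings(show_progress_bars=False):
    ...     cluster_obj = coor.cluster_mini_batch_kmeans(traj_data, k=20, batch_size=0.5)
    ...     cluster_obj.get_output() # doctest: +ELLIPSIS
    [array([...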


    .. autoclass:: pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering
        :members:
        :undoc-members:

        .. rubric:: Methods

        .. autoautosummary:: pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering
           :methods:

        .. rubric:: Attributes

        .. autoautosummary:: pyemma.coordinates.clustering.kmeans.MiniBatchKmeansClustering
            :attributes:

    References
    ----------
    .. [1] Sculley, D. (2010). Web-scale k-means clustering.
        Proceedings of the 19th International Conference on World Wide Web.
        http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf

    """
    from pyemma.coordinates.clustering.kmeans import MiniBatchKmeansClustering
    res = MiniBatchKmeansClustering(n_clusters=k, max_iter=max_iter, metric=metric, init_strategy=init_strategy,
                                    batch_size=batch_size, n_jobs=n_jobs, skip=skip, clustercenters=clustercenters)
    from pyemma.util.reflection import get_default_args
    cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_mini_batch_kmeans)['chunksize'], **kwargs)
    if data is not None:
        res.estimate(data, chunksize=cs)
    else:
        res.chunksize = cs
    return res
Similar code snippets
1.
markovmodel/PyEMMA · api.py
Match rating: 69.93%
def cluster_kmeans(data=None, k=None, max_iter=10, tolerance=1e-5, stride=1,
                   metric='euclidean', init_strategy='kmeans++', fixed_seed=False,
                   n_jobs=None, chunksize=None, skip=0, keep_data=False, clustercenters=None, **kwargs):
    r"""k-means clustering

    If data is given, it performs a k-means clustering and then assigns the
    data using a Voronoi discretization. It returns a :class:`KmeansClustering <pyemma.coordinates.clustering.KmeansClustering>`
    object that can be used to extract the discretized data sequences, or to
    assign other data points to the same partition. If data is not given, an
    empty :class:`KmeansClustering <pyemma.coordinates.clustering.KmeansClustering>`
    will be created that still needs to be parametrized, e.g. in a :func:`pipeline`.

    Parameters
    ----------
    data: ndarray (T, d) or list of ndarray (T_i, d) or a reader created by :func:`source`
        input data, if available in memory

    k: int
        the number of cluster centers. When not specified (None), min(sqrt(N), 5000) is chosen as default value,
        where N denotes the number of data points

    max_iter : int, default=10
        maximum number of iterations before stopping.

    tolerance : float
        stop iteration when the relative change in the cost function

        :math:`C(S) = \sum_{i=1}^{k} \sum_{\mathbf x \in S_i} \left\| \mathbf x - \boldsymbol\mu_i \right\|^2`

        is smaller than tolerance.

    stride : int, optional, default = 1
        If set to 1, all input data will be used for estimation. Note that this
        could cause this calculation to be very slow for large data sets. Since
        molecular dynamics data is usually correlated at short timescales, it
        is often sufficient to estimate transformations at a longer stride.
        Note that the stride option in the get_output() function of the returned
        object is independent, so you can parametrize at a long stride, and
        still map all frames through the transformer.

    metric : str
        metric to use during clustering ('euclidean', 'minRMSD')

    init_strategy : str
        determines how the initial cluster centers are chosen: 'kmeans++'
        picks them according to the k-means++ algorithm, while 'uniform'
        draws them uniformly distributed from the provided data set.

    fixed_seed : bool or (positive) integer
        if set to True, the random seed is fixed, resulting in deterministic
        behavior; default is False.
        If an integer >= 0 is given, it is used to initialize the random generator.

    n_jobs : int or None, default None
        Number of threads to use during assignment of the data.
        If None, all available CPUs will be used.

    chunksize: int, default=None
        Number of data frames to process at once. Choose a higher value here,
        to optimize thread usage and gain processing speed. If None is passed,
        use the default value of the underlying reader/data source. Choose zero to
        disable chunking entirely.

    skip : int, default=0
        skip the first initial n frames per trajectory.

    keep_data: boolean, default=False
        if you intend to quickly resume a non-converged kmeans iteration, set this to True.
        Otherwise the linear memory array will have to be re-created. Note that even then,
        the data will be deleted if, and only if, the estimation converges within the
        given tolerance parameter.

    clustercenters: ndarray (k, dim), default=None
        if passed, the init_strategy is ignored and these centers will be iterated.

    Returns
    -------
    kmeans : a :class:`KmeansClustering <pyemma.coordinates.clustering.KmeansClustering>` clustering object
        Object for kmeans clustering.
        It holds discrete trajectories and cluster center information.


    Examples
    --------

    >>> import numpy as np
    >>> from pyemma.util.contexts import settings
    >>> import pyemma.coordinates as coor
    >>> traj_data = [np.random.random((100, 3)), np.random.random((100,3))]
    >>> with settings(show_progress_bars=False):
    ...     cluster_obj = coor.cluster_kmeans(traj_data, k=20, stride=1)
    ...     cluster_obj.get_output() # doctest: +ELLIPSIS
    [array([...

    .. seealso:: **Theoretical background**: `Wiki page <http://en.wikipedia.org/wiki/K-means_clustering>`_


    .. autoclass:: pyemma.coordinates.clustering.kmeans.KmeansClustering
        :members:
        :undoc-members:

        .. rubric:: Methods

        .. autoautosummary:: pyemma.coordinates.clustering.kmeans.KmeansClustering
           :methods:

        .. rubric:: Attributes

        .. autoautosummary:: pyemma.coordinates.clustering.kmeans.KmeansClustering
            :attributes:

    References
    ----------
    The k-means algorithm was invented in [1]_. The term k-means was
    first used in [2]_.

    .. [1] Steinhaus, H. (1957).
        Sur la division des corps materiels en parties.
        Bull. Acad. Polon. Sci. (in French) 4, 801-804.

    .. [2] MacQueen, J. B. (1967).
        Some Methods for classification and Analysis of Multivariate Observations.
        Proceedings of 5th Berkeley Symposium on Mathematical Statistics and
        Probability 1. University of California Press. pp. 281-297

    """
    from pyemma.coordinates.clustering.kmeans import KmeansClustering
    res = KmeansClustering(n_clusters=k, max_iter=max_iter, metric=metric, tolerance=tolerance,
                           init_strategy=init_strategy, fixed_seed=fixed_seed, n_jobs=n_jobs, skip=skip,
                           keep_data=keep_data, clustercenters=clustercenters, stride=stride)
    from pyemma.util.reflection import get_default_args
    cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_kmeans)['chunksize'], **kwargs)
    if data is not None:
        res.estimate(data, chunksize=cs)
    else:
        res.chunksize = cs
    return res
2.
markovmodel/PyEMMA · kmeans.py
Match rating: 62.48%
def __init__(self, n_clusters, max_iter=5, metric='euclidean', tolerance=1e-5, init_strategy='kmeans++',
                 batch_size=0.2, oom_strategy='memmap', fixed_seed=False, stride=None, n_jobs=None, skip=0,
                 clustercenters=None, keep_data=False):

        if stride is not None:
            raise ValueError("stride is a dummy value in MiniBatch Kmeans")
        if batch_size > 1:
            raise ValueError("batch_size should be less or equal to 1, but was %s" % batch_size)
        if keep_data:
            raise ValueError("keep_data is a dummy value in MiniBatch Kmeans")

        super(MiniBatchKmeansClustering, self).__init__(n_clusters, max_iter, metric,
                                                        tolerance, init_strategy, False,
                                                        oom_strategy, stride=stride, n_jobs=n_jobs, skip=skip,
                                                        clustercenters=clustercenters, keep_data=False)

        self.set_params(batch_size=batch_size)
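
A hypothetical direct instantiation (in PyEMMA this object is normally created via cluster_mini_batch_kmeans above); here each iteration would update the centers from a random 10% slice of the data:

mbk = MiniBatchKmeansClustering(n_clusters=50, max_iter=20, batch_size=0.1)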
3.
daler/metaseq · plotutils.py
Match rating: 58.85%
def new_clustered_sortind(x, k=10, row_key=None, cluster_key=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Each cluster of rows can then be sorted with `row_key`, and the clusters
    themselves ordered with `cluster_key`; by default, rows and clusters keep
    their original order.

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param row_key:
        Optional function to act as a sort key for sorting rows within
        clusters.  Signature should be `scorefunc(a)` where `a` is a 1-D NumPy
        array.
    :param cluster_key:
        Optional function for sorting clusters.  Signature is `clusterfunc(a)`
        where `a` is a NumPy array containing all rows of `x` for cluster `i`.
        It must return a single value.
    """
    import numpy as np

    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikit-learn for clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k

    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        best_k = min(mean_dists, key=mean_dists.get)  # k with the lowest mean distance

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k
    labels = mbk.labels_

    if cluster_key:
        # It's easier for calling code to provide something that operates on
        # a cluster level, but here it's converted to work on a label level
        # that looks in to the array `x`.
        def _cluster_key(i):
            return cluster_key(x[labels == i, :])
        sorted_labels = sorted(range(k), key=_cluster_key)
    else:
        # Otherwise just use them as-is.
        sorted_labels = range(k)

    if row_key:
        # Again, easier to provide a function to operate on a row.  But here we
        # need it to accept an index
        def _row_key(i):
            return row_key(x[i, :])

    final_ind = []
    breaks = []
    pos = 0
    for label in sorted_labels:
        # which rows in `x` have this label
        label_inds = np.nonzero(labels == label)[0]
        if row_key:
            label_sort_ind = sorted(label_inds, key=_row_key)
        else:
            label_sort_ind = label_inds
        for li in label_sort_ind:
            final_ind.append(li)
        pos += len(label_inds)
        breaks.append(pos)

    return np.array(final_ind), np.array(breaks)
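
A hypothetical usage sketch (the random matrix and the matplotlib plotting are illustrative assumptions, not part of the original snippet): reorder the rows of a matrix and mark the cluster boundaries with axhline, as the docstring suggests.

import numpy as np
import matplotlib.pyplot as plt

x = np.random.random((200, 50))
# sort rows within each cluster by their peak value, and order the clusters
# by their mean peak height
ind, breaks = new_clustered_sortind(x, k=4,
                                    row_key=lambda a: a.max(),
                                    cluster_key=lambda a: a.mean(axis=0).max())
plt.imshow(x[ind], aspect='auto', interpolation='nearest')
for b in breaks[:-1]:   # the final break is simply the total row count
    plt.axhline(b, color='white')
plt.show()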
4.
brainiak/brainiak · tfa.py
Match rating: 58.69%
def init_centers_widths(self, R):
        """Initialize prior of centers and widths

        Returns
        -------

        centers : 2D array, with shape [K, n_dim]
            Prior of factors' centers.

        widths : 1D array, with shape [K, 1]
            Prior of factors' widths.

        """

        kmeans = KMeans(
            init='k-means++',
            n_clusters=self.K,
            n_init=10,
            random_state=100)
        kmeans.fit(R)
        centers = kmeans.cluster_centers_
        widths = self._get_max_sigma(R) * np.ones((self.K, 1))
        return centers, widths
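
A standalone sketch of the same initialization pattern (K and the constant width stand in for the class attributes self.K and self._get_max_sigma(R), which are assumptions from the surrounding class):

import numpy as np
from sklearn.cluster import KMeans

R = np.random.random((500, 3))       # e.g. voxel coordinates
K = 10
kmeans = KMeans(init='k-means++', n_clusters=K, n_init=10, random_state=100)
kmeans.fit(R)
centers = kmeans.cluster_centers_    # shape (K, n_dim)
widths = 2.0 * np.ones((K, 1))       # placeholder for self._get_max_sigma(R)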
5.
ismms-himc/clustergrammer2 · downsample_fun.py
Match rating: 58.29%
def calc_mbk_clusters(X, n_clusters, random_state=1000):

  # kmeans is run with rows as data-points and columns as dimensions
  mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters,
                         max_no_improvement=100, verbose=0,
                         random_state=random_state)

  # need to loop through each label (each k-means cluster) and count how many
  # points were given this label. This will give the population size of each label
  mbk.fit(X)
  cluster_data = mbk.labels_
  clusters = mbk.cluster_centers_

  mbk_cluster_names, cluster_pop = np.unique(cluster_data, return_counts=True)

  num_returned_clusters = len(cluster_pop)

  return clusters, num_returned_clusters, cluster_data, cluster_pop
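
A hypothetical call with random data (the original module is assumed to import numpy as np and MiniBatchKMeans at the top level):

import numpy as np
from sklearn.cluster import MiniBatchKMeans

X = np.random.random((500, 10))
clusters, n_found, labels, pops = calc_mbk_clusters(X, n_clusters=8)
assert pops.sum() == X.shape[0]      # every row carries exactly one label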
6.
neuropsychology/NeuroKit.py · eeg_microstates.py
Match rating: 58.05%
def eeg_microstates_clustering(data, n_microstates=4, clustering_method="kmeans", n_jobs=1, n_init=25, occurence_rejection_treshold=0.05, max_refitting=5, verbose=True):
    """
    Fit the clustering algorithm.
    """
    # Create training set
    training_set = data.copy()

    if verbose is True:
        print("- Initializing the clustering algorithm...")
    if clustering_method == "kmeans":
        algorithm = sklearn.cluster.KMeans(init='k-means++', n_clusters=n_microstates, n_init=n_init, n_jobs=n_jobs)
    elif clustering_method == "spectral":
        algorithm = sklearn.cluster.SpectralClustering(n_clusters=n_microstates, n_init=n_init, n_jobs=n_jobs)
    elif clustering_method == "agglom":
        algorithm = sklearn.cluster.AgglomerativeClustering(n_clusters=n_microstates, linkage="complete")
    elif clustering_method == "dbscan":
        algorithm = sklearn.cluster.DBSCAN(min_samples=100)
    elif clustering_method == "affinity":
        algorithm = sklearn.cluster.AffinityPropagation(damping=0.5)
    else:
        raise ValueError("NeuroKit Error: eeg_microstates(): clustering_method must be "
                         "'kmeans', 'spectral', 'dbscan', 'affinity' or 'agglom'")


    refitting = 0  # Initialize the number of refittings
    good_fit_achieved = False
    while good_fit_achieved is False:
        good_fit_achieved = True
        if verbose is True:
            print("- Fitting the classifier...")
        # Fit the algorithm
        algorithm.fit(training_set)

        if verbose is True:
            print("- Clustering back the initial data...")
        # Predict the more likely cluster for each observation
        predicted = algorithm.fit_predict(training_set)

        if verbose is True:
            print("- Check for abnormalities...")
        # Check for abnormalities and prune the training set until none found
        occurences = dict(collections.Counter(predicted))
        masks = [np.array([True]*len(training_set))]
        for microstate in occurences:
            # is the frequency of one microstate below the rejection threshold?
            if occurences[microstate] < len(data)*occurence_rejection_treshold:
                good_fit_achieved = False
                refitting += 1  # increment the refitting counter
                print("NeuroKit Warning: eeg_microstates(): detected some outliers: refitting the classifier (n=" + str(refitting) + ").")
                masks.append(predicted != microstate)
        if refitting >= max_refitting:
            # give up once the maximum number of refitting attempts is reached
            good_fit_achieved = True
        mask = np.all(masks, axis=0)
        training_set = training_set[mask]

    return algorithm
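
A hypothetical call with random maps in place of real EEG topographies (the snippet assumes module-level imports of sklearn.cluster, numpy as np and collections):

import numpy as np

data = np.random.random((1000, 32))  # n_samples x n_channels
model = eeg_microstates_clustering(data, n_microstates=4, verbose=False)
microstate_sequence = model.fit_predict(data)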
7.
daler/metaseq · plotutils.py
Match rating: 57.41%
def clustered_sortind(x, k=10, scorefunc=None):
    """
    Uses MiniBatch k-means clustering to cluster matrix into groups.

    Each cluster of rows is then sorted by `scorefunc` -- by default, the max
    peak height when all rows in a cluster are averaged, or
    cluster.mean(axis=0).max().

    Returns the index that will sort the rows of `x` and a list of "breaks".
    `breaks` is essentially a cumulative row count for each cluster boundary.
    In other words, after plotting the array you can use axhline on each
    "break" to plot the cluster boundary.

    If `k` is a list or tuple, iteratively try each one and select the best
    with the lowest mean distance from cluster centers.

    :param x: Matrix whose rows are to be clustered
    :param k: Number of clusters to create or a list of potential clusters; the
        optimum will be chosen from the list
    :param scorefunc: Optional function for sorting rows within clusters.  Must
        accept a single argument of a NumPy array.
    """
    import itertools

    import numpy as np

    try:
        from sklearn.cluster import MiniBatchKMeans
    except ImportError:
        raise ImportError('please install scikit-learn for clustering.')

    # If integer, do it once and we're done
    if isinstance(k, int):
        best_k = k

    else:
        mean_dists = {}
        for _k in k:
            mbk = MiniBatchKMeans(init='k-means++', n_clusters=_k)
            mbk.fit(x)
            mean_dists[_k] = mbk.transform(x).mean()
        best_k = min(mean_dists, key=mean_dists.get)  # k with the lowest mean distance

    mbk = MiniBatchKMeans(init='k-means++', n_clusters=best_k)
    mbk.fit(x)
    k = best_k
    labels = mbk.labels_
    scores = np.zeros(labels.shape, dtype=float)

    if not scorefunc:
        def scorefunc(x):
            return x.mean(axis=0).max()

    for label in range(k):
        ind = labels == label
        score = scorefunc(x[ind, :])
        scores[ind] = score

    pos = 0
    breaks = []
    ind = np.argsort(scores)
    for k, g in itertools.groupby(labels[ind]):
        pos += len(list(g))
        breaks.append(pos)

    return ind, breaks
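
A similar hypothetical usage relying on the default scorefunc (mean peak height per cluster):

import numpy as np

x = np.random.random((200, 50))
ind, breaks = clustered_sortind(x, k=4)
x_sorted = x[ind]   # rows grouped by cluster, clusters ordered by score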
8.
oscarbranson/latools · clustering.py
Match rating: 56.28%
def cluster_kmeans(data, n_clusters, **kwargs):
    """
    Identify clusters using K - Means algorithm.

    Parameters
    ----------
    data : array_like
        array of size [n_samples, n_features].
    n_clusters : int
        The number of clusters expected in the data.

    Returns
    -------
    labels : ndarray
        Cluster label of each sample; a boolean array for cluster ``i`` is
        ``labels == i``.
    list
        ``[np.nan]`` placeholder.
    """
    km = cl.KMeans(n_clusters, **kwargs)
    kmf = km.fit(data)

    labels = kmf.labels_

    return labels, [np.nan]
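
A hypothetical usage deriving the per-cluster boolean masks mentioned in the docstring (cl is assumed to be sklearn.cluster, as in the original module):

import numpy as np
import sklearn.cluster as cl

data = np.random.random((300, 4))
labels, _ = cluster_kmeans(data, n_clusters=3, random_state=0)
masks = {i: labels == i for i in np.unique(labels)}   # boolean array per cluster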
9.
markovmodel/PyEMMA · api.py
Match rating: 56.09%
def cluster_regspace(data=None, dmin=-1, max_centers=1000, stride=1, metric='euclidean',
                     n_jobs=None, chunksize=None, skip=0, **kwargs):
    r"""Regular space clustering

    If given data, it performs a regular space clustering [1]_ and returns a
    :class:`RegularSpaceClustering <pyemma.coordinates.clustering.RegularSpaceClustering>` object that
    can be used to extract the discretized data sequences, or to assign other
    data points to the same partition. If data is not given, an empty
    :class:`RegularSpaceClustering <pyemma.coordinates.clustering.RegularSpaceClustering>` will be created
    that still needs to be parametrized, e.g. in a :func:`pipeline`.

    Regular space clustering is very similar to Hartigan's leader algorithm [2]_.
    It consists of two passes through the data. Initially, the first data point
    is added to the list of centers. For every subsequent data point, if it has
    a greater distance than dmin from every center, it also becomes a center.
    In the second pass, a Voronoi discretization with the computed centers is
    used to partition the data.

    Parameters
    ----------
    data : ndarray (T, d) or list of ndarray (T_i, d) or a reader created by :func:`source`
        input data, if available in memory

    dmin : float
        the minimal distance between cluster centers

    max_centers : int (optional), default=1000
        If max_centers is reached, the algorithm will stop finding more centers,
        but it is possible that parts of the state space are not properly
        discretized. This will generate a warning. If that happens, it is
        suggested to increase dmin such that the number of centers stays below
        max_centers.

    stride : int, optional, default = 1
        If set to 1, all input data will be used for estimation. Note that this
        could cause this calculation to be very slow for large data sets. Since
        molecular dynamics data is usually correlated at short timescales, it is
        often sufficient to estimate transformations at a longer stride. Note
        that the stride option in the get_output() function of the returned
        object is independent, so you can parametrize at a long stride, and
        still map all frames through the transformer.

    metric : str
        metric to use during clustering ('euclidean', 'minRMSD')

    n_jobs : int or None, default None
        Number of threads to use during assignment of the data.
        If None, all available CPUs will be used.

    chunksize: int, default=None
        Number of data frames to process at once. Choose a higher value here,
        to optimize thread usage and gain processing speed. If None is passed,
        use the default value of the underlying reader/data source. Choose zero to
        disable chunking entirely.

    skip : int, default=0
        skip the first initial n frames per trajectory.


    Returns
    -------
    regSpace : a :class:`RegularSpaceClustering <pyemma.coordinates.clustering.RegularSpaceClustering>` clustering  object
        Object for regular space clustering.
        It holds discrete trajectories and cluster center information.


    .. autoclass:: pyemma.coordinates.clustering.regspace.RegularSpaceClustering
        :members:
        :undoc-members:

        .. rubric:: Methods

        .. autoautosummary:: pyemma.coordinates.clustering.regspace.RegularSpaceClustering
           :methods:

        .. rubric:: Attributes

        .. autoautosummary:: pyemma.coordinates.clustering.regspace.RegularSpaceClustering
            :attributes:

    References
    ----------
    .. [1] Prinz J-H, Wu H, Sarich M, Keller B, Senne M, Held M, Chodera JD, Schuette Ch and Noe F. 2011.
        Markov models of molecular kinetics: Generation and Validation.
        J. Chem. Phys. 134, 174105.

    .. [2] Hartigan J. Clustering algorithms.
        New York: Wiley; 1975.

    """
    if dmin == -1:
        raise ValueError("provide a minimum distance for clustering, e.g. 2.0")
    from pyemma.coordinates.clustering.regspace import RegularSpaceClustering as _RegularSpaceClustering
    res = _RegularSpaceClustering(dmin, max_centers=max_centers, metric=metric,
                                  n_jobs=n_jobs, stride=stride, skip=skip)
    from pyemma.util.reflection import get_default_args
    cs = _check_old_chunksize_arg(chunksize, get_default_args(cluster_regspace)['chunksize'], **kwargs)
    if data is not None:
        res.estimate(data, chunksize=cs)
    else:
        res.chunksize = cs
    return res
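
The center-finding pass described in the docstring can be sketched as follows (a simplification assuming a euclidean metric and in-memory data, not PyEMMA's actual implementation):

import numpy as np

def regspace_centers(X, dmin, max_centers=1000):
    # first pass: a point becomes a new center if it lies farther than dmin
    # from every center found so far
    centers = [X[0]]
    for x in X[1:]:
        if min(np.linalg.norm(x - c) for c in centers) > dmin:
            centers.append(x)
            if len(centers) >= max_centers:
                break        # parts of the state space may stay undiscretized
    return np.array(centers)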
10.
szairis/sakmapper · network.py
Match rating: 55.43%
def optimal_clustering(df, patch, method='kmeans', statistic='gap', max_K=5):
    if len(patch) == 1:
        return [patch]

    if statistic == 'db':
        if method == 'kmeans':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                kmeans = cluster.KMeans(n_clusters=k).fit(X)
                clustering[k] = pd.DataFrame(kmeans.predict(X), index=patch)
                dist_mu = squareform(pdist(kmeans.cluster_centers_))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

        elif method == 'agglomerative':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                agglomerative = cluster.AgglomerativeClustering(n_clusters=k, linkage='average').fit(X)
                clustering[k] = pd.DataFrame(agglomerative.fit_predict(X), index=patch)
                tmp = [list(clustering[k][clustering[k][0] == i].index) for i in range(k)]
                centers = np.array([np.mean(X.loc[c, :], axis=0) for c in tmp])
                dist_mu = squareform(pdist(centers))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

    elif statistic == 'gap':
        X = np.array(df.loc[patch, :])
        if method == 'kmeans':
            f = cluster.KMeans
        else:
            raise ValueError('gap statistic is only supported with kmeans')
        gaps = gap(X, ks=range(1, min(max_K, len(patch))), method=f)
        k_optimal = list(gaps).index(max(gaps))+1
        clustering = pd.DataFrame(f(n_clusters=k_optimal).fit_predict(X), index=patch)
        return [list(clustering[clustering[0] == i].index) for i in range(k_optimal)]

    else:
        raise ValueError('only db and gap statistics are supported')
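
davies_bouldin and gap are helpers not shown in this snippet; a hypothetical davies_bouldin consistent with the calls above (pairwise center distances plus per-cluster dispersions) is the standard Davies-Bouldin index:

import numpy as np

def davies_bouldin(dist_mu, sigma):
    # mean over clusters of the worst (sigma_i + sigma_j) / d(mu_i, mu_j) ratio
    k = len(sigma)
    ratios = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            if i != j:
                ratios[i, j] = (sigma[i] + sigma[j]) / dist_mu[i, j]
    return ratios.max(axis=1).mean()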