Rand index function (clustering performance evaluation)



As far as I know, there is no package available in Python for the (unadjusted) Rand index, while for the adjusted Rand index you can use sklearn.metrics.adjusted_rand_score(labels_true, labels_pred).
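
For reference, the adjusted variant is called like this (a minimal sketch; the labels here are made up):

from sklearn.metrics import adjusted_rand_score

labels_true = [1, 1, 0, 0, 0, 0]
labels_pred = [0, 0, 0, 1, 0, 1]
# the adjusted Rand index corrects for chance agreement, so it can be
# negative and reaches 1.0 only for a perfect match
adjusted_rand_score(labels_true, labels_pred)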

I wrote code for the Rand score and will share it here as an answer to this post.

from scipy.special import comb
from itertools import combinations
import numpy as np

def check_clusterings(labels_true, labels_pred):
    """Check that the two clusterings are matching 1D integer arrays."""
    labels_true = np.asarray(labels_true)
    labels_pred = np.asarray(labels_pred)
    # input checks
    if labels_true.ndim != 1:
        raise ValueError(
            "labels_true must be 1D: shape is %r" % (labels_true.shape,))
    if labels_pred.ndim != 1:
        raise ValueError(
            "labels_pred must be 1D: shape is %r" % (labels_pred.shape,))
    if labels_true.shape != labels_pred.shape:
        raise ValueError(
            "labels_true and labels_pred must have same size, got %d and %d"
            % (labels_true.shape[0], labels_pred.shape[0]))
    return labels_true, labels_pred

def rand_score(labels_true, labels_pred):
    """Given the true and predicted labels, return the Rand index."""
    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
    # all index pairs over the samples
    my_pairs = list(combinations(range(len(labels_true)), 2))
    my_a = 0  # pairs grouped together in both labelings
    my_b = 0  # pairs separated in both labelings
    for i, j in my_pairs:
        same_true = labels_true[i] == labels_true[j]
        same_pred = labels_pred[i] == labels_pred[j]
        if same_true and same_pred:
            my_a += 1
        elif not same_true and not same_pred:
            my_b += 1
    my_denom = comb(len(labels_true), 2)
    ri = (my_a + my_b) / my_denom
    return ri

As a simple example (with 6 points there are comb(6, 2) = 15 pairs, of which 7 are treated consistently by the two labelings, hence 7/15 ≈ 0.467):

labels_true = [1, 1, 0, 0, 0, 0]
labels_pred = [0, 0, 0, 1, 0, 1]
rand_score(labels_true, labels_pred)
#0.46666666666666667

There are probably ways to improve it and make it more Pythonic. If you have any suggestions, feel free to improve it.
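
For instance, a NumPy-based rewrite could look like the sketch below (not a drop-in replacement for the function above, just an illustration that compares all pairs at once instead of looping in Python):

import numpy as np
from itertools import combinations

def rand_score_vectorized(labels_true, labels_pred):
    labels_true = np.asarray(labels_true)
    labels_pred = np.asarray(labels_pred)
    pairs = np.array(list(combinations(range(len(labels_true)), 2)))
    same_true = labels_true[pairs[:, 0]] == labels_true[pairs[:, 1]]
    same_pred = labels_pred[pairs[:, 0]] == labels_pred[pairs[:, 1]]
    # a pair counts as an agreement if it is grouped together in both
    # labelings or separated in both labelings
    return np.mean(same_true == same_pred)

rand_score_vectorized(labels_true, labels_pred)
# ≈ 0.4667 for the example labels above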

I found this implementation, which seems to be faster.

import numpy as np
from scipy.special import comb

def rand_index_score(clusters, classes):
    # pairs that share a cluster in the prediction (TP + FP)
    tp_plus_fp = comb(np.bincount(clusters), 2).sum()
    # pairs that share a class in the ground truth (TP + FN)
    tp_plus_fn = comb(np.bincount(classes), 2).sum()
    A = np.c_[(clusters, classes)]
    # pairs that share both a cluster and a class (TP)
    tp = sum(comb(np.bincount(A[A[:, 0] == i, 1]), 2).sum()
             for i in set(clusters))
    fp = tp_plus_fp - tp
    fn = tp_plus_fn - tp
    tn = comb(len(A), 2) - tp - fp - fn
    # Rand index: fraction of pairs on which the two labelings agree
    return (tp + tn) / (tp + fp + fn + tn)

As a simple example:

labels_true = [1, 1, 0, 0, 0, 0]
labels_pred = [0, 0, 0, 1, 0, 1]
rand_index_score(labels_true, labels_pred)
#0.46666666666666667
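
As a quick consistency check (a sketch that assumes both rand_score and rand_index_score defined above are in scope), the two implementations should agree on random labelings:

import numpy as np

rng = np.random.default_rng(0)
a = list(rng.integers(0, 4, size=50))
b = list(rng.integers(0, 4, size=50))
# both functions compute the plain Rand index, so the results should coincide
assert np.isclose(rand_score(a, b), rand_index_score(a, b))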

Starting with scikit-learn 0.24.0, the function sklearn.metrics.rand_score was added, which implements the (unadjusted) Rand index. Check the changelog.

All you have to do is:

from sklearn.metrics import rand_score
rand_score(labels_true, labels_pred)

labels_true and labels_pred can have values in different domains. For example:

>>> rand_score(['a', 'b', 'c'], [5, 6, 7])
1.0

Here is my code:

import itertools as it

def rand_index_score(y_gold, y_predict):
    # all index pairs over the samples
    index1_index2_pairs = list(it.combinations(range(len(y_gold)), 2))
    numberOfPairs = len(index1_index2_pairs)
    numerator = 0
    for index1, index2 in index1_index2_pairs:
        theyReallyAreInSameGroup = y_gold[index1] == y_gold[index2]
        itIsPredictedThatTheyAreInSameGroup = y_predict[index1] == y_predict[index2]
        # count the pair when both labelings agree (same group or different groups)
        if theyReallyAreInSameGroup == itIsPredictedThatTheyAreInSameGroup:
            numerator += 1

    return numerator / numberOfPairs
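
For completeness, calling it on the same example as above should give the same value:

labels_true = [1, 1, 0, 0, 0, 0]
labels_pred = [0, 0, 0, 1, 0, 1]
rand_index_score(labels_true, labels_pred)
# ≈ 0.4667, matching the results above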
