Using a custom metric function on a PySpark DataFrame



I defined a custom function in Python that computes per-class AUC scores in a one-vs-rest fashion. It takes the true classes and the predicted probabilities for each class as input, and returns the AUC score for each class.

from sklearn.metrics import roc_curve, auc
import pandas as pd

def mclass_auc(y_true, y_pred, n_class):
    fp = {}
    tp = {}
    aucs = {}
    for i in range(n_class):
        # one-vs-rest: relabel class i as 1 and every other class as 0
        classes = [0] * n_class
        classes[i] = 1
        fp[i], tp[i], th = roc_curve(y_true.replace(list(range(n_class)), classes), y_pred[:, i])
        aucs[i] = auc(fp[i], tp[i])
    return aucs
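
For clarity, the y_true.replace(...) call is what binarizes the labels one-vs-rest. A tiny made-up example of what it produces for class 1 out of 3 (the input values here are purely illustrative):

import pandas as pd

y = pd.Series([0, 1, 2, 1])
# relabel class 1 as 1 and every other class as 0
print(y.replace([0, 1, 2], [0, 1, 0]).tolist())   # [0, 1, 0, 1]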

To keep things simple, I generated some probability values; they do not sum to one.

import numpy as np

cola = [np.random.randint(40, 81) / 100 for i in range(10000)]
colb = [np.random.randint(30, 801) / 1000 for i in range(10000)]
colc = [np.random.randint(40, 81) / 200 for i in range(10000)]
coly = [np.random.randint(0, 3) for i in range(10000)]  # labels 0..2 for 3 classes
sample_df = pd.DataFrame({'0': cola, '1': colb, '2': colc, 'y': coly})
y_true = sample_df['y']
y_pred = sample_df[['0', '1', '2']].values
mclass_auc(y_true, y_pred, 3)

In plain Python I can use the function above. Can someone help me compute this in a PySpark DataFrame setting? Converting the data to a pandas DataFrame and doing the calculation there fails in this case.
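
The snippets below assume sql is an existing SparkSession (or SQLContext). A minimal setup might look like this, with the variable names being my assumption rather than anything from the original code:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sql = spark  # createDataFrame is available directly on the SparkSession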

from pyspark.mllib.evaluation import BinaryClassificationMetrics
import pyspark.sql.functions as F

def mclass_auc_spark(y_true, y_pred, n_class):
    aucs = {}
    for i in range(n_class):
        pred = y_pred.select(str(i)).withColumn('row_id', F.monotonically_increasing_id())
        true = y_true.withColumn('y', F.when(F.col('y') == i, 1.0).otherwise(0.0)).withColumn('row_id', F.monotonically_increasing_id())
        pred_labels = pred.join(true, on='row_id')
        # BinaryClassificationMetrics expects an RDD of (score, label) pairs
        metric = BinaryClassificationMetrics(pred_labels.select(str(i), 'y').rdd)
        aucs[i] = metric.areaUnderROC
    return aucs

spark_df = sql.createDataFrame(sample_df)
y_true = spark_df.select('y')
y_pred = spark_df.select('0', '1', '2')
auc_scores = mclass_auc_spark(y_true, y_pred, 3)
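
One caveat with the attempt above: monotonically_increasing_id() is only guaranteed to be increasing and unique, not consecutive, and generating it separately on two DataFrames does not guarantee that the ids line up, so the join can mispair rows. Since both columns come from the same spark_df, a sketch that sidesteps the join entirely (the function name is mine, not part of any API):

def mclass_auc_spark2(df, label_col, n_class):
    aucs = {}
    for i in range(n_class):
        # build (score, label) pairs from the same DataFrame, no join needed
        score_and_label = df.select(
            F.col(str(i)).cast('double'),
            F.when(F.col(label_col) == i, 1.0).otherwise(0.0)
        ).rdd.map(tuple)
        aucs[i] = BinaryClassificationMetrics(score_and_label).areaUnderROC
    return aucs

auc_scores = mclass_auc_spark2(spark_df, 'y', 3)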
