如何在Databricks上使用单独的数据集验证AutoML结果



我在Databricks上执行AutoML功能。但我想在单独的数据集上验证模型。

由于我不太了解MLflow,我尝试先读取新的数据集,再把它赋给split_test_df,但没有成功。

笔记本内的代码如下:

import os
import shutil
import uuid

import databricks.automl_runtime
import mlflow
import pandas as pd
from mlflow.tracking import MlflowClient

# Name of the label column; it is separated from the features further down.
target_col = "my_target_column"

# Create temp directory to download input data from MLflow
input_temp_dir = os.path.join(os.environ["SPARK_LOCAL_DIRS"], "tmp", str(uuid.uuid4())[:8])
os.makedirs(input_temp_dir)

try:
    # Download the "data" artifact and read it into a pandas DataFrame
    input_client = MlflowClient()
    input_data_path = input_client.download_artifacts("some_numbers_and_letters", "data", input_temp_dir)
    df_loaded = pd.read_parquet(os.path.join(input_data_path, "training_data"))
finally:
    # Delete the temp data even when the download or the parquet read fails,
    # so a failed run does not leave files behind on the local disk.
    shutil.rmtree(input_temp_dir)

# Preview data
df_loaded.head(5)
# Load the separate data set from its Delta table; the result is a Spark
# DataFrame, not a pandas one.
# NOTE(review): header/inferSchema look like CSV-reader options; presumably
# they have no effect on a Delta source — confirm.
delta_reader = spark.read.format("delta").options(header=True, inferSchema=True)
df = delta_reader.load("dbfs:/user/hive/warehouse/test_df/")
from databricks.automl_runtime.sklearn.column_selector import ColumnSelector
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

# Columns handed to the column selector.
supported_cols = ["there_are_my_columns"]
col_selector = ColumnSelector(supported_cols)

# Numerical pipeline: coerce values to numbers (unparseable values become NaN
# because of errors="coerce"), impute with SimpleImputer's defaults, then
# standardize.
num_imputers = [("impute_mean", SimpleImputer(), ["there_are_my_columns"])]
numerical_pipeline = Pipeline(steps=[
    ("converter", FunctionTransformer(lambda frame: frame.apply(pd.to_numeric, errors="coerce"))),
    ("imputers", ColumnTransformer(num_imputers)),
    ("standardizer", StandardScaler()),
])

# The original line had unbalanced brackets and did not parse. ColumnTransformer
# expects (name, transformer, columns) triples, so the entry is rebuilt around
# numerical_pipeline, which was otherwise unused.
# NOTE(review): the step name "numerical" is a placeholder — confirm against
# the notebook AutoML generated.
numerical_transformers = [("numerical", numerical_pipeline, ["there_are_my_columns"])]

transformers = numerical_transformers
preprocessor = ColumnTransformer(transformers, remainder="passthrough", sparse_threshold=0)
# AutoML completed train - validation - test split internally and used
# _automl_split_col_3da1 to specify the set each row belongs to.
split_col = "_automl_split_col_3da1"
split_train_df = df_loaded.loc[df_loaded[split_col] == "train"]
split_val_df = df_loaded.loc[df_loaded[split_col] == "val"]

# `df` is a Spark DataFrame, which has no pandas-style `.loc` (the cause of
# "AttributeError: 'DataFrame' object has no attribute 'loc'"), so convert it
# to pandas first. toPandas() collects every row onto the driver, so the data
# set has to fit in driver memory.
split_test_df = df.toPandas()
# A separately prepared data set normally has no AutoML split column; only
# filter on it when it is actually there, otherwise evaluate on every row.
if split_col in split_test_df.columns:
    split_test_df = split_test_df.loc[split_test_df[split_col] == "test"]

# Separate target column from features and drop _automl_split_col_3da1
X_train = split_train_df.drop([target_col, split_col], axis=1)
y_train = split_train_df[target_col]
X_val = split_val_df.drop([target_col, split_col], axis=1)
y_val = split_val_df[target_col]
# drop() without axis/columns removes row labels, not columns, so name the
# columns explicitly; errors="ignore" tolerates the missing split column.
X_test = split_test_df.drop(columns=[target_col, split_col], errors="ignore")
y_test = split_test_df[target_col]

即使新数据集在MLflow上下文中被读取并传入模型,我也没有看到混淆矩阵有任何变化,因此我不确定混淆矩阵是否真的基于测试数据计算,也不确定结果是否准确。

可以在笔记本中实现这一点:把验证数据集变量重新赋值为要用于验证的那个数据集即可。

最新更新