如何使用sklearn管道实现缓存



我参考了这个问题:"使用 scikit Pipeline 测试模型,但只对数据预处理一次",但其中的做法对我不起作用。我使用的是 scikit-learn 1.0.2。

示例:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from tempfile import mkdtemp
from joblib import Memory
import time
from shutil import rmtree
class Test(BaseEstimator, TransformerMixin):
    """Deliberately slow pass-through transformer for observing caching.

    Parameters
    ----------
    col : str
        Label printed by ``transform`` so its execution is visible.
    """

    def __init__(self, col):
        self.col = col

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Print five dots (one per second), print ``col``, return ``X``."""
        for _ in range(5):
            # Just to slow it down, so a cache hit would be obvious.
            print(".")
            time.sleep(1)
        print(self.col)
        # Return the input unchanged so the result is usable as data.
        return X
# A new temporary directory is created on every run and is not removed here.
cachedir = mkdtemp()
memory = Memory(location=cachedir, verbose=10)

# Single-step pipeline: ``Test`` is both the first and the final step.
pipline = Pipeline(
    steps=[("test", Test(col="this_column"))],
    memory=memory,
)
pipline.fit_transform(None)

将显示:

.
.
.
.
.
this_column

第二次调用时,我期望结果已被缓存,因此在 this_column 之前不应再输出那五行 "."。

然而事实并非如此:第二次调用时,带有 time.sleep 的 for 循环仍然被执行,并再次打印了同样的输出。

为什么会发生这种情况?

管道的最后一步似乎不会被缓存:memory 参数只缓存最后一步之前各转换器的拟合与转换结果。下面是一个稍作修改的脚本版本,用来演示这一点。

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import time
class Test(BaseEstimator, TransformerMixin):
    """Slow pass-through transformer that reports when it is fitted.

    Parameters
    ----------
    col : str
        Label printed by ``fit`` so every real fit call is visible.
    """

    def __init__(self, col):
        self.col = col

    def fit(self, X, y=None):
        """Print ``col`` and return ``self``."""
        print(self.col)
        return self

    def transform(self, X, y=None):
        """Print five dots (one per second) and return ``X`` unchanged."""
        for _ in range(5):
            # Just to slow it down, so a cache hit would be obvious.
            print(".")
            time.sleep(1)
        # ``col`` is printed in ``fit`` rather than here.
        return X
# Two-step pipeline: "test" is an intermediate step, "test2" is the final one.
pipline = Pipeline(
    steps=[
        ("test", Test(col="this_column")),
        ("test2", Test(col="that_column")),
    ],
    memory="tmp/cache",
)

# Fit three times in a row to see which steps are executed again.
for _ in range(3):
    pipline.fit(None)

# Recorded output: "this_column" and the five dots appear once, while
# "that_column" appears once per fit call.
#this_column
#.
#.
#.
#.
#.
#that_column
#that_column
#that_column

最新更新