如何在Tensorflow中将窗口数据集馈送到StringLookup层

如果我有以下数据，并希望使用StringLookup进行预处理:

# Toy data: one string feature, two integer features, and an integer target.
y = np.arange(10)
x = pd.DataFrame(
    {
        'col1': list('abcdefghij'),
        'col2': np.arange(10),
        'col3': np.arange(10),
    }
)

首先，我需要将我的窗口数据集转换为张量字典，因为模型期望张量作为输入(也许有更好的方法?):

window_size = 3

# Slide a window of `window_size` consecutive rows over the data, moving one
# row at a time; incomplete trailing windows are dropped.
dataset = tf.data.Dataset.from_tensor_slices((dict(x), y)).window(
    window_size, shift=1, drop_remainder=True
)

# Extra preprocessing to get dict of tensors: each window component is itself
# a dataset, so batch it into a single tensor of `window_size` elements.
dataset = dataset.flat_map(
    lambda window_x, window_y: tf.data.Dataset.zip(
        (
            {k: v.batch(window_size) for k, v in window_x.items()},
            window_y.batch(window_size),
        )
    )
)
dataset = dataset.batch(3)

# Inspect the first batch of windows.
for i, j in dataset.take(1):
    print(i, j)

输出:

{'col1': <tf.Tensor: shape=(3, 3), dtype=string, numpy=
array([[b'a', b'b', b'c'],
[b'b', b'c', b'd'],
[b'c', b'd', b'e']], dtype=object)>, 'col2': <tf.Tensor: shape=(3, 3), dtype=int64, numpy=
array([[0, 1, 2],
[1, 2, 3],
[2, 3, 4]])>, 'col3': <tf.Tensor: shape=(3, 3), dtype=int64, numpy=
array([[0, 1, 2],
[1, 2, 3],
[2, 3, 4]])>} tf.Tensor(
[[0 1 2]
[1 2 3]
[2 3 4]], shape=(3, 3), dtype=int64)

为不同的dtype创建预处理器，如下例所示:

# One symbolic scalar-per-row input for every DataFrame column.
inputs = {
    'col1': tf.keras.Input(shape=(), name='col1', dtype=tf.string),
    'col2': tf.keras.Input(shape=(), name='col2', dtype=tf.float32),
    'col3': tf.keras.Input(shape=(), name='col3', dtype=tf.float32),
}

# The vocabulary is the sorted set of distinct strings in the string column.
vocab = sorted(set(x['col1']))
lookup = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode='one_hot')

# Add an explicit trailing axis before the one-hot lookup. Note the comma:
# `[:tf.newaxis]` is the slice `[:None]`, which leaves the tensor unchanged.
lookup = lookup(inputs['col1'][:, tf.newaxis])

# Stack the numeric columns side by side and append them to the one-hot block.
numeric = tf.stack(
    [tf.cast(inputs[col], dtype=tf.float32) for col in ['col2', 'col3']],
    axis=-1,
)
result = tf.concat([lookup, numeric], axis=-1)
preprocessor = tf.keras.Model(inputs, result)
# Test preprocessor
preprocessor(dict(x))

输出:

<tf.Tensor: shape=(10, 13), dtype=float32, numpy=
array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 2., 2.],
[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 3., 3.],
[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 4., 4.],
[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 5., 5.],
[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 6., 6.],
[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 7., 7.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 8., 8.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 9., 9.]],
dtype=float32)>

创建模型:

# Trainable part of the network: a hidden dense layer followed by an output
# layer with one unit per window position.
body = tf.keras.models.Sequential()
body.add(tf.keras.layers.Dense(8))
body.add(tf.keras.layers.Dense(window_size))

# NOTE: this rebinds `x` from the DataFrame to the preprocessed tensor.
x = preprocessor(inputs)
result = body(x)

# End-to-end model: raw column inputs -> preprocessor -> dense body.
model = tf.keras.Model(inputs=inputs, outputs=result)
model.summary()

输出:

Model: "model"
__________________________________________________________________________________________________
Layer (type)                   Output Shape         Param #     Connected to                     
==================================================================================================
col1 (InputLayer)              [(None,)]            0           []                               
                            
col2 (InputLayer)              [(None,)]            0           []                               
                            
col3 (InputLayer)              [(None,)]            0           []                               
                            
model_35 (Functional)          (None, 13)           0           ['col1[0][0]',                   
'col2[0][0]',                   
'col3[0][0]']                   
                            
sequential_19 (Sequential)     (None, 3)            139         ['model_35[2][0]']               
                            
==================================================================================================
Total params: 139
Trainable params: 139
Non-trainable params: 0
__________________________________________________________________________________________________

编译和训练:

# Compile with mean-absolute-error loss and the Adam optimizer, then train on
# the windowed dataset. This is the step that produces the ValueError shown
# below.
model.compile(loss='mae', optimizer='adam')
model.fit(dataset)

错误:

ValueError: Exception encountered when calling layer "string_lookup_24" (type StringLookup).
When output_mode is not `'int'`, maximum supported output rank is 2. Received output_mode one_hot and input shape (None, None), which would result in output rank 3.
Call arguments received:
• inputs=tf.Tensor(shape=(None, None), dtype=string)

我应该如何构建预处理器，或者如何对数据集进行预处理，才能让它正常工作？谢谢！

类似下面这样的做法应该可以解决您的问题:

import tensorflow as tf
import numpy as np
import pandas as pd
# Toy data: one string feature, two integer features, and an integer target.
x = pd.DataFrame(
    {
        'col1': list('abcdefghij'),
        'col2': np.arange(10),
        'col3': np.arange(10),
    }
)
y = np.arange(10)
window_size = 3

# Sliding windows of `window_size` rows, shifted by one row each time.
dataset = tf.data.Dataset.from_tensor_slices((dict(x), y)).window(
    window_size, shift=1, drop_remainder=True
)

# Extra preprocessing to get dict of tensors: batch every per-window dataset
# into a tensor, temporarily carrying the target under the key "y".
dataset = dataset.flat_map(
    lambda features, target: tf.data.Dataset.zip(
        {
            **{name: column.batch(window_size) for name, column in features.items()},
            "y": target.batch(window_size),
        }
    )
)

# Split the merged dict back into a (features, target) pair.
dataset = dataset.map(
    lambda merged: (
        {name: value for name, value in merged.items() if name != 'y'},
        merged["y"],
    )
)

# One-hot encode the string column inside the data pipeline, one window at a
# time, and only then batch the windows together.
vocab = sorted(set(x['col1']))
lookup = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode='one_hot')
dataset = dataset.map(
    lambda features, target: (
        {
            'col1': lookup(features['col1']),
            'col2': features['col2'],
            'col3': features['col3'],
        },
        target,
    )
)
dataset = dataset.batch(3)

你的模型:

# Model inputs match what the dataset yields per example: `col1` is already
# one-hot encoded per window step, `col2`/`col3` are one value per window step.
inputs = {
    'col1': tf.keras.Input(
        shape=(window_size, lookup.vocabulary_size()), name='col1', dtype=tf.float32
    ),
    'col2': tf.keras.Input(shape=(window_size,), name='col2', dtype=tf.float32),
    'col3': tf.keras.Input(shape=(window_size,), name='col3', dtype=tf.float32),
}

# Stack both numeric columns (col2 AND col3) along a new trailing axis so they
# line up with the one-hot features, then concatenate on that axis.
numeric = tf.stack([inputs['col2'], inputs['col3']], axis=-1)
result = tf.concat([inputs['col1'], numeric], axis=-1)
preprocessor = tf.keras.Model(inputs, result)

# Flatten each window's feature matrix before the dense layers; the output
# layer has one unit per window position.
body = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(8),
        tf.keras.layers.Dense(window_size),
    ]
)

# NOTE: this rebinds `x` from the DataFrame to the preprocessed tensor.
x = preprocessor(inputs)
result = body(x)
model = tf.keras.Model(inputs, result)
model.summary()
model.compile(loss='mae', optimizer='adam')
model.fit(dataset)

我们也可以完全省略模型中的预处理层，改为在数据集管道中直接使用StringLookup层:

# Columns that hold strings and therefore need a StringLookup encoding.
lookup_cols = ["col1"]

# Sliding windows of `window_size` rows, shifted by one row each time.
dataset = tf.data.Dataset.from_tensor_slices((dict(x), y)).window(
    window_size, shift=1, drop_remainder=True
)

# One one-hot StringLookup layer per string column, each with the sorted set
# of that column's distinct values as its vocabulary.
lookups = {
    col: tf.keras.layers.StringLookup(
        vocabulary=sorted(set(x[col])), output_mode="one_hot"
    )
    for col in lookup_cols
}

# Batch every per-window dataset into a single tensor of `window_size` elements.
dataset = dataset.flat_map(
    lambda features, target: tf.data.Dataset.zip(
        (
            {k: v.batch(window_size) for k, v in features.items()},
            target.batch(window_size),
        )
    )
)

# Do all preprocessing inside the data pipeline: one-hot encode the string
# columns, cast the remaining columns to float32, and join everything into a
# single feature tensor per window.
dataset = dataset.map(
    lambda features, target: (
        tf.concat(
            [
                tf.concat(
                    [lookups[k](v) for k, v in features.items() if k in lookup_cols],
                    axis=-1,
                ),
                tf.stack(
                    [
                        tf.cast(v, dtype=tf.float32)
                        for k, v in features.items()
                        if k not in lookup_cols
                    ],
                    axis=-1,
                ),
            ],
            axis=-1,
        ),
        target,
    )
)
dataset = dataset.batch(3)

# Inspect the first batch of windows.
for i, j in dataset.take(1):
    print(i, j)

输出:

tf.Tensor(
[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 2. 2.]]
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 2. 2.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 3. 3.]]
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 2. 2.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 3. 3.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 4. 4.]]], shape=(3, 3, 13), dtype=float32) tf.Tensor(
[[0 1 2]
[1 2 3]
[2 3 4]], shape=(3, 3), dtype=int64)

相关内容

最新更新

热门标签：