将数据保存为 SequenceExample 格式的 TFRecords,并读取后供 LSTM 使用



假设我有一个形状为(样本, 时间步长, 特征)的三维数据集,我想用合适的 TensorFlow 数据集格式保存它,例如用 SequenceExample 写入 TFRecord,以便之后读取并在 LSTM 中使用。数据集如下:

import os

import numpy as np
import tensorflow as tf
# Toy input array laid out as (samples, timesteps, features) = (2, 2, 2).
data = np.array(
    [
        [[1, 10], [2, 11]],
        [[2, 11], [3, 12]],
    ],
    dtype=np.float32,
)
# One float32 target value per sample.
y = np.array([101.0, 202.0], dtype=np.float32)

有一个模型:

# Functional-API model: Input -> LSTM -> Dense(1).
# Each sample fed to the model has shape (2, 2); the batch axis is implicit.
inputs = tf.keras.layers.Input(shape=(2, 2), name='input')

# The LSTM width is taken from the last axis of `data`. Only the final output
# is forwarded: no per-timestep sequence and no internal state is returned.
outputs = tf.keras.layers.LSTM(
    units=data.shape[2],
    return_sequences=False,
    return_state=False,
    name='lstm',
)(inputs)

# Single-unit dense head on top of the LSTM output.
outputs = tf.keras.layers.Dense(units=1, name='dense')(outputs)

# NOTE(review): `loss` is assigned but nothing visible in this snippet reads
# it (compile() below is given the string 'mse' instead).
loss = tf.keras.losses.MSE

model = tf.keras.Model(inputs=inputs, outputs=outputs, name='model')
# Configure training with the RMSprop optimizer and mean-squared-error loss.
# `metrics` is passed as a list, which is the form Keras documents; newer
# Keras releases reject a bare string here.
model.compile(
    optimizer='rmsprop',
    loss='mse',
    metrics=['mse'],
)
model.summary()

# Fit directly on the in-memory arrays, one sample per batch.
model.fit(
    x=data,
    y=y,
    batch_size=1,
)

让我们尝试使用tensorflow API保存和读取数据集:

def _float_feature(value):
    """Wrap a single float in a ``tf.train.Feature`` backed by a FloatList.

    Defined here because the write loop below calls it and no definition is
    visible in this snippet.
    """
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


# writer
# NOTE(review): zlib level 0 means "no compression" -- confirm this is intended.
options = tf.io.TFRecordOptions(
    compression_type='ZLIB',
    flush_mode=None,
    input_buffer_size=None,
    output_buffer_size=None,
    window_bits=None,
    compression_level=0,
    compression_method=None,
    mem_level=None,
    compression_strategy=None,
)

# The context manager closes the file even if building or serializing an
# example raises part-way through.
with tf.io.TFRecordWriter(path=r'test.tfrecord', options=options) as writer:
    # One SequenceExample per sample (first axis of `data`).
    for i, sample in enumerate(data):
        # Context part of the example: the sample's position in `data`.
        sample_dict = {
            'index': tf.train.Feature(int64_list=tf.train.Int64List(value=[i])),
        }
        # One FeatureList per feature column, keyed by the column index as a
        # string; each list holds that column's value at every timestep.
        features_list = {}
        for c in range(sample.shape[1]):
            feature_values = [_float_feature(v) for v in sample[:, c]]
            features_list[str(c)] = tf.train.FeatureList(feature=feature_values)
        example = tf.train.SequenceExample(
            context=tf.train.Features(feature=sample_dict),
            feature_lists=tf.train.FeatureLists(feature_list=features_list),
        )
        writer.write(example.SerializeToString())
# read raw
# Dataset of serialized records read back from the file written above.
data_raw = tf.data.TFRecordDataset(
    filenames=[r'test.tfrecord'],
    compression_type='ZLIB',  # same compression type the writer options use
    buffer_size=10 * 1024 * 1024,  # 10 MB read buffer
    # Leave one core free, but never ask for fewer than one reader
    # (cpu_count() may be None or 1).
    num_parallel_reads=max(1, (os.cpu_count() or 1) - 1),
)
# parse real
# Parsing schema for the sequence part of each example: every feature column
# (keyed by its index as a string) maps to the same float32 sequence spec with
# scalar entries.
schema = dict.fromkeys(
    (str(s) for s in range(data[0].shape[1])),
    tf.io.FixedLenSequenceFeature([], dtype=tf.float32),
)
def decode_fn(record_bytes):
    """Parse one serialized SequenceExample and return its sequence features.

    Args:
        record_bytes: One serialized ``tf.train.SequenceExample`` record.

    Returns:
        The parsed sequence features, keyed like ``schema``. The parsed
        context (the 'index' feature) is discarded.
    """
    context_spec = {'index': tf.io.FixedLenFeature([], dtype=tf.int64)}
    _context, sequence = tf.io.parse_single_sequence_example(
        serialized=record_bytes,
        context_features=context_spec,
        sequence_features=schema,
    )
    return sequence
# read real
# Print every parsed record, each followed by a blank line.
for r in data_raw.map(decode_fn):
    print(r, '\n')

当我试图用tensorflow数据集拟合模型时,它会给我一个错误

# This is the failing call discussed in the surrounding text. `data_raw` is
# passed as-is, i.e. without being mapped through a parsing function, so the
# model receives the raw records rather than (timesteps, features) tensors.
# No labels are supplied either.
# NOTE(review): Keras documents that `batch_size` should not be given when the
# input is a dataset; batch the dataset itself instead.
model.fit(x=data_raw, batch_size=1)
ValueError: Input 0 of layer lstm is incompatible with the layer: expected ndim=3, found ndim=0. Full shape received: []

我知道我没有给 TensorFlow 数据集添加标签,但在这种情况下这无关紧要,因为数据集本身的形状就不符合模型的输入要求。有人能帮我理解我的代码为什么出错、错在哪里吗?

我添加了解码函数,并将其映射(map)到 TFRecord 数据集上,问题就全部解决了。谢谢。

最新更新