NaN loss when training a deep neural recommendation model using TensorFlow

I am trying to follow the TensorFlow documentation and apply the same technique to one of the toy datasets.

During training I am getting the loss as NaN. I have tried debugging with Debugger V2, and I can see that tf.keras.layers.GlobalAveragePooling1D produces NaN because of a division by zero, which then turns all values into NaN during backpropagation. What is not clear from the Debugger V2 GUI, however, is why the sum becomes 0. I did try reducing the number of features and the size of the dataset, but each of those attempts gave me new errors (I may start a separate question thread for each of them later).
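
To make the failure mode concrete, here is a minimal self-contained sketch (with made-up token IDs, not the actual vocabulary): when every token in a row is the padding ID 0 and the embedding uses mask_zero=True, GlobalAveragePooling1D divides by a mask sum of zero and returns NaN for that row.

import tensorflow as tf

emb = tf.keras.layers.Embedding(100, 4, mask_zero=True)
pool = tf.keras.layers.GlobalAveragePooling1D()

tokens = tf.constant([[5, 9, 0], [0, 0, 0]])  # second row is all padding
pooled = pool(emb(tokens), mask=emb.compute_mask(tokens))
print(pooled.numpy())  # first row is finite, second row is all NaN

This is exactly what happens when TextVectorization maps an empty string to an all-zero token sequence.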

Below is the code for reference. I have also provided the dataset here. I ran the code below on Google Colab.

import os
import pprint
import tempfile
from typing import Dict, Text
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
tf.debugging.experimental.enable_dump_debug_info(
    "./tfdbg2_logdir",
    tensor_debug_mode="FULL_HEALTH",
    circular_buffer_size=-1)
!pip install -q tensorflow-recommenders
import tensorflow_recommenders as tfrs  

Preparing the data

ds=pd.read_csv('train_recom.csv')
ds['year'].replace(0,1,inplace=True)
ds_song=ds.groupby(['song_id','title','release','artist_name','year']).size().reset_index().rename(columns={0:'count'})
ds_song.to_csv('songs_details.csv')
ds.to_csv('train_recom_transformed.csv')

Reading the data into TensorFlow datasets

ratings = tf.data.experimental.make_csv_dataset(
    "./train_recom_transformed.csv",
    batch_size=5,
    select_columns=['user_id', 'song_id', 'listen_count', 'title', 'release',
                    'artist_name', 'year'],
    header=True,
    num_epochs=1,
    ignore_errors=False,)
songs = tf.data.experimental.make_csv_dataset(
    "./songs_details.csv",
    batch_size=128,
    select_columns=['song_id', 'title', 'release', 'artist_name', 'year'],
    num_epochs=1,
    ignore_errors=True,)
ratings = ratings.unbatch().map(lambda x: {
    "song_id": x["song_id"],
    "user_id": x["user_id"],
    "release": x["release"],
    "artist_name": x["artist_name"],
    "title": x["title"],
    "year": x["year"],
    "listen_count": x["listen_count"]
})
songs = songs.unbatch().map(lambda x: x["song_id"]) 

Preparing the train and test datasets

tf.random.set_seed(42)
shuffled = ratings.shuffle(16000, seed=42, reshuffle_each_iteration=False)
train = shuffled.take(12000)
test = shuffled.skip(12000).take(4000)
cached_train = train.shuffle(100_000).batch(1200).cache()
cached_test = test.batch(400).cache()
title = songs.batch(1000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["user_id"])
unique_song_titles = np.unique(np.concatenate(list(title)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))
year_data=np.concatenate(list(ratings.map(lambda x: x['year']).batch(4000)))

The user model class

class UserModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        max_tokens = 1_000_000
        embedding_dimension = 32
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])
        self.release_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=max_tokens)
        self.release_text_embedding = tf.keras.Sequential([
            self.release_vectorizer,
            tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True, input_length=144),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        self.release_vectorizer.adapt(np.concatenate(list(ratings.map(lambda x: x['release']).batch(4000))))
        self.artist_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=max_tokens)
        self.artist_text_embedding = tf.keras.Sequential([
            self.artist_vectorizer,
            tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        self.artist_vectorizer.adapt(np.concatenate(list(ratings.map(lambda x: x['artist_name']).batch(4000))))
        self.title_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=max_tokens)
        self.title_text_embedding = tf.keras.Sequential([
            self.title_vectorizer,
            tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        self.title_vectorizer.adapt(np.concatenate(list(ratings.map(lambda x: x['title']).batch(4000))))
        self.year_embedding = tf.keras.Sequential([
            tf.keras.layers.Embedding(len(year_data) + 1, 32),
        ])

    def call(self, inputs):
        return tf.concat([
            self.user_embedding(inputs['user_id']),
            self.release_text_embedding(inputs['release']),
            self.year_embedding(inputs['year']),
            self.artist_text_embedding(inputs['artist_name']),
            self.title_text_embedding(inputs['title']),
        ], axis=1)

The item model

class ItemModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        max_tokens = 10_000
        embedding_dimension = 32
        ## embed title from unique_song_titles
        self.title_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_song_titles, mask_token=None),
            tf.keras.layers.Embedding(len(unique_song_titles) + 1, embedding_dimension)
        ])

    def call(self, inputs):
        return self.title_embedding(inputs)

The query model: creating the deep model

class QueryModel(tf.keras.Model):
    """Model for encoding user queries."""

    def __init__(self, layer_sizes):
        """Model for encoding user queries.

        Args:
          layer_sizes:
            A list of integers where the i-th entry represents the number of
            units the i-th layer contains.
        """
        super().__init__()
        # We first use the user model for generating embeddings.
        self.embedding_model = UserModel()
        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential()
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))
        # No activation for the last layer.
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))

    def call(self, inputs):
        feature_embedding = self.embedding_model(inputs)
        return self.dense_layers(feature_embedding)

Creating the deep model for the item model

class CandidateModel(tf.keras.Model):
    """Model for encoding songs."""

    def __init__(self, layer_sizes):
        """Model for encoding songs.

        Args:
          layer_sizes:
            A list of integers where the i-th entry represents the number of
            units the i-th layer contains.
        """
        super().__init__()
        self.embedding_model = ItemModel()
        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential()
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))
        # No activation for the last layer.
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))

    def call(self, inputs):
        feature_embedding = self.embedding_model(inputs)
        return self.dense_layers(feature_embedding)

Combining the query and candidate models

class SongModel(tfrs.models.Model):
    def __init__(self, layer_sizes):
        super().__init__()
        self.query_model = QueryModel(layer_sizes)
        self.candidate_model = CandidateModel(layer_sizes)
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=songs.batch(128).map(self.candidate_model),
            ),
        )

    def compute_loss(self, features, training=False):
        print('type of feature ----', type(features))
        query_embeddings = self.query_model({
            "user_id": features["user_id"],
            "release": features["release"],
            "artist_name": features["artist_name"],
            "title": features["title"],
            "year": features["year"],
        })
        item_embeddings = self.candidate_model(features["song_id"])
        return self.task(query_embeddings, item_embeddings)

Training the model

model = SongModel([32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
model_hist = model.fit(cached_train, epochs=9)

Below is the output I get

WARNING:tensorflow:Failed to read source code from path: /content/<ipython-input-26-fdc864fc30cf>. Reason: Source path neither exists nor can be loaded as a .par file: /content/<ipython-input-26-fdc864fc30cf>
WARNING:tensorflow:Failed to read source code from path: /content/<ipython-input-25-e3009db55439>. Reason: Source path neither exists nor can be loaded as a .par file: /content/<ipython-input-25-e3009db55439>
Epoch 1/9
type of feature ---- <class 'dict'>
WARNING:tensorflow:Model was constructed with shape (None, None) for input KerasTensor(type_spec=TensorSpec(shape=(None, None), dtype=tf.float32, name='embedding_10_input'), name='embedding_10_input', description="created by layer 'embedding_10_input'"), but it was called on an input with incompatible shape (None,).
type of feature ---- <class 'dict'>
WARNING:tensorflow:Model was constructed with shape (None, None) for input KerasTensor(type_spec=TensorSpec(shape=(None, None), dtype=tf.float32, name='embedding_10_input'), name='embedding_10_input', description="created by layer 'embedding_10_input'"), but it was called on an input with incompatible shape (None,).
10/10 [==============================] - 63s 1s/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0022 - factorized_top_k/top_10_categorical_accuracy: 0.0033 - factorized_top_k/top_50_categorical_accuracy: 0.0073 - factorized_top_k/top_100_categorical_accuracy: 0.0103 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 2/9
10/10 [==============================] - 9s 945ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 3/9
10/10 [==============================] - 10s 953ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 4/9
10/10 [==============================] - 9s 948ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 5/9
10/10 [==============================] - 10s 966ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 6/9
10/10 [==============================] - 10s 955ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 7/9
10/10 [==============================] - 10s 955ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 8/9
10/10 [==============================] - 10s 958ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan
Epoch 9/9
10/10 [==============================] - 10s 971ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: nan - regularization_loss: 0.0000e+00 - total_loss: nan

I ran into a similar error when using tfrs on a custom dataset. It turned out that I had some non-printing characters and symbols in the data. I simply searched for and removed the symbols (manually, with some regex), and also restricted the text columns in the dataframe to printable characters only.

from string import printable as pt

allowed_set = set(pt)
df[col] = df[col].apply(lambda x: ''.join([' ' if s not in allowed_set else s for s in x]))

I hope it helps.

The problem was that when we replaced the special characters with spaces, the whole value became NULL for one record (in the release field). In short, it was a data issue, not a code issue. We then added two lines to handle that case: ds.replace(r'^\s*$', np.nan, regex=True) followed by a fillna('None'). Below is the complete code with all the changes made.
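
To see that fix in isolation, here is a small self-contained sketch with made-up values (not the project data): fields that end up blank after the character cleanup are first mapped to NaN and then filled with the placeholder string 'None'.

import numpy as np
import pandas as pd

df = pd.DataFrame({"release": ["Greatest Hits", "++--", "   "]})
# Strip everything that is not alphanumeric or a space.
df["release"] = df["release"].replace({r"[^a-zA-Z0-9 ]+": ""}, regex=True)
# Rows that became blank are mapped to NaN, then filled with 'None'.
df = df.replace(r"^\s*$", np.nan, regex=True)
df["release"] = df["release"].fillna("None")
print(df["release"].tolist())  # ['Greatest Hits', 'None', 'None']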

import os
import pprint
import tempfile
from typing import Dict, Text
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
!pip install -q tensorflow-recommenders
import tensorflow_recommenders as tfrs  
ds=pd.read_csv('train_recom.csv')
print(ds['release'].isnull().sum())
print(ds['title'].isnull().sum())
print(ds['artist_name'].isnull().sum())
print(ds['year'].isnull().sum())
print(ds.isna().any(axis=None))
print(any(ds[c].hasnans for c in ds))
for c in ds:
    if ds[c].hasnans:
        print(c)
ds['year'].replace(0,1,inplace=True)
ds.release.replace({r'[^a-zA-Z0-9 ]+':''}, regex=True, inplace=True)
ds.artist_name.replace({r'[^a-zA-Z0-9 ]+':''}, regex=True, inplace=True)
ds.title.replace({r'[^a-zA-Z0-9 ]+':''}, regex=True, inplace=True)
ds2 = ds.replace(r'^\s*$', np.nan, regex=True)
ds2['release'] = ds2['release'].fillna('None')
ds = ds2
ds_song=ds.groupby(['song_id','title','release','artist_name','year']).size().reset_index().rename(columns={0:'count'})
ds_song.to_csv('songs_details.csv')
ds.to_csv('train_recom_transformed.csv')
ratings = tf.data.experimental.make_csv_dataset(
    "./train_recom_transformed.csv",
    batch_size=5,
    select_columns=['user_id', 'song_id', 'listen_count', 'title', 'release',
                    'artist_name', 'year'],
    header=True,
    num_epochs=1,
    ignore_errors=False,)
songs = tf.data.experimental.make_csv_dataset(
    "./songs_details.csv",
    batch_size=128,
    select_columns=['song_id', 'title', 'release', 'artist_name', 'year'],
    num_epochs=1,
    ignore_errors=True,)
ratings = ratings.unbatch().map(lambda x: {
    "song_id": x["song_id"],
    "user_id": x["user_id"],
    "release": x["release"],
    "artist_name": x["artist_name"],
    "title": x["title"],
    "year": x["year"],
    "listen_count": x["listen_count"]
})
songs = songs.unbatch().map(lambda x: {
    "song_id": x["song_id"],
    "release": x["release"],
    "artist_name": x["artist_name"],
    "title": x["title"],
    "year": x["year"],
})
tf.random.set_seed(42)
shuffled = ratings.shuffle(16000, seed=42, reshuffle_each_iteration=False)
train = shuffled.take(12000)
test = shuffled.skip(12000).take(4000)
cached_train = train.shuffle(100_000).batch(1200).cache()
cached_test = test.batch(400).cache()
title = songs.batch(1000).map(lambda x: x["title"])
user_ids = ratings.batch(1_000_000).map(lambda x: x["user_id"])
unique_song_titles = np.unique(np.concatenate(list(title)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))
year_data=list(songs.map(lambda x: x['year']))
class UserModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        max_tokens = 1_000_000
        embedding_dimension = 32
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])

    def call(self, inputs):
        return self.user_embedding(inputs['user_id'])
class ItemModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        max_tokens = 1_000_000
        embedding_dimension = 32
        ## embed title from unique_song_titles
        self.title_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_song_titles, mask_token=None),
            tf.keras.layers.Embedding(len(unique_song_titles) + 1, embedding_dimension)
        ])
        self.release_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=max_tokens)
        self.release_text_embedding = tf.keras.Sequential([
            self.release_vectorizer,
            tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True, input_length=144),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        self.release_vectorizer.adapt(songs.map(lambda x: x['release']))
        self.artist_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=max_tokens)
        self.artist_text_embedding = tf.keras.Sequential([
            self.artist_vectorizer,
            tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        self.artist_vectorizer.adapt(songs.map(lambda x: x['artist_name']))
        self.title_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=max_tokens)
        self.title_text_embedding = tf.keras.Sequential([
            self.title_vectorizer,
            tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        self.title_vectorizer.adapt(songs.map(lambda x: x['title']))
        self.year_embedding = tf.keras.Sequential([
            tf.keras.layers.Embedding(len(year_data) + 1, 32),
            # tf.keras.layers.Embedding(2501, 32),
        ])

    def call(self, inputs):
        # return self.title_embedding(inputs['title'])
        return tf.concat([
            self.title_embedding(inputs['title']),
            self.release_text_embedding(inputs['release']),
            self.year_embedding(inputs['year']),
            self.artist_text_embedding(inputs['artist_name']),
            self.title_text_embedding(inputs['title']),
        ], axis=1)
class QueryModel(tf.keras.Model):
    """Model for encoding user queries."""

    def __init__(self, layer_sizes):
        """Model for encoding user queries.

        Args:
          layer_sizes:
            A list of integers where the i-th entry represents the number of
            units the i-th layer contains.
        """
        super().__init__()
        # We first use the user model for generating embeddings.
        self.embedding_model = UserModel()
        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential()
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))
        # No activation for the last layer.
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))

    def call(self, inputs):
        feature_embedding = self.embedding_model(inputs)
        return self.dense_layers(feature_embedding)
class CandidateModel(tf.keras.Model):
    """Model for encoding songs."""

    def __init__(self, layer_sizes):
        """Model for encoding songs.

        Args:
          layer_sizes:
            A list of integers where the i-th entry represents the number of
            units the i-th layer contains.
        """
        super().__init__()
        self.embedding_model = ItemModel()
        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential()
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))
        # No activation for the last layer.
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))

    def call(self, inputs):
        feature_embedding = self.embedding_model(inputs)
        return self.dense_layers(feature_embedding)
class SongModel(tfrs.models.Model):
    def __init__(self, layer_sizes):
        super().__init__()
        self.query_model = QueryModel(layer_sizes)
        self.candidate_model = CandidateModel(layer_sizes)
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=songs.batch(128).map(self.candidate_model),
            ),
        )

    def compute_loss(self, features, training=False):
        print('type of feature ----', type(features))
        query_embeddings = self.query_model({
            "user_id": features["user_id"],
        })
        item_embeddings = self.candidate_model({
            "song_id": features["song_id"],
            "title": features["title"],
            "release": features["release"],
            "artist_name": features["artist_name"],
            "year": features["year"],
        })
        return self.task(query_embeddings, item_embeddings)
model = SongModel([32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
model_hist = model.fit(cached_train, epochs=9)
