假设这是数据集:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
############
### DATA ###
############
# Training set: two dated observations (date 1 and 2) for each of three people,
# with 0/1 indicator columns for the occupation plus the known age and weight.
TrainingData = {
    'name': ['Alex', 'Ben', 'Marry', 'Alex', 'Ben', 'Marry'],
    'teacher': [1, 0, 0, 1, 0, 0],
    'doctor': [0, 1, 0, 0, 1, 0],
    'engineer': [0, 0, 1, 0, 0, 1],
    'age': [27, 32, 78, 27, 32, 78],
    'weight': [160, 209, 130, 164, 206, 132],
    'date': [1, 1, 1, 2, 2, 2],
}
# Test set: the same three people, with age and weight missing.
TestData = {
    'name': ['Alex', 'Ben', 'Marry'],
    'teacher': [1, 0, 0],
    'doctor': [0, 1, 0],
    'engineer': [0, 0, 1],
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises
    # AttributeError there.
    'age': [np.nan, np.nan, np.nan],
    'weight': [np.nan, np.nan, np.nan],
    # NOTE(review): TrainingData names this column 'date'; 'data' looks like a
    # typo. It is kept unchanged here because the sample output printed for
    # this frame shows the 'data' header.
    'data': [3, 3, 3],
}
# Convert to pandas dataframe
dfTraining = pd.DataFrame(TrainingData)
dfTest = pd.DataFrame(TestData)
# Print
print(dfTraining)
print(dfTest)
训练数据:
name teacher doctor engineer age weight date
0 Alex 1 0 0 27 160 1
1 Ben 0 1 0 32 209 1
2 Marry 0 0 1 78 130 1
3 Alex 1 0 0 27 164 2
4 Ben 0 1 0 32 206 2
5 Marry 0 0 1 78 132 2
测试:
name teacher doctor engineer age weight data
0 Alex 1 0 0 NaN NaN 3
1 Ben 0 1 0 NaN NaN 3
2 Marry 0 0 1 NaN NaN 3
我将它们转换为 NumPy 数组,为机器学习模型做准备:
# Prepare per-person targets, features and a "value present" mask.
# NOTE(review): the preceding snippet creates dfTraining / dfTest, not
# df_train / df_test -- presumably these are the same frames; confirm.

# Target: the training weights grouped by name, each group converted to a
# NumPy array.
Y=df_train.groupby('name')['weight'].apply(lambda x: (x.to_numpy()))
# Feature frame: every training column except the target 'weight'.
df_train_x=df_train.drop('weight', axis=1)
# Features grouped by name, each group converted to a NumPy array. The result
# is a pandas object indexed by group, not a single 2-D ndarray.
X= df_train_x.groupby('name').apply(lambda x: (x.to_numpy()))
# Number of neighbours handed to KNeighborsRegressor.
K=1
# Test features: every test column except 'weight'.
df_test_x=df_test.drop('weight', axis=1)
# Per-name boolean mask that is True where a test value is not null.
X_pred_null=df_test_x.groupby('name').apply(lambda x: (x.notnull()))
# The same mask as a NumPy array; row 0 is later used to pick columns of X.
PresentVariables = (X_pred_null.to_numpy())
现在我想从X中排除空列,但是当我使用以下代码时:
NearestNeighbor = KNeighborsRegressor(n_neighbors=K).fit(X[:, PresentVariables[0]], Y)
会给出如下错误:
KeyError Traceback (most recent call last)
Input In [23], in <cell line: 1>()
----> 1 NearestNeighbor = KNeighborsRegressor(n_neighbors=K).fit(X[:, PresentVariables[0]], Y)
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/pandas/core/series.py:906, in Series.__getitem__(self, key)
903 key = np.asarray(key, dtype=bool)
904 return self._get_values(key)
--> 906 return self._get_with(key)
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/pandas/core/series.py:921, in Series._get_with(self, key)
916 raise TypeError(
917 "Indexing a Series with DataFrame is not "
918 "supported, use the appropriate DataFrame column"
919 )
920 elif isinstance(key, tuple):
--> 921 return self._get_values_tuple(key)
923 elif not is_list_like(key):
924 # e.g. scalars that aren't recognized by lib.is_scalar, GH#32684
925 return self.loc[key]
File /anaconda/envs/azureml_py38/lib/python3.8/site-packages/pandas/core/series.py:956, in Series._get_values_tuple(self, key)
953 return result
955 if not isinstance(self.index, MultiIndex):
--> 956 raise KeyError("key of type tuple not found and not a MultiIndex")
958 # If key is contained, would have returned by now
959 indexer, new_index = self.index.get_loc_level(key)
KeyError: 'key of type tuple not found and not a MultiIndex'
错误出在 X[:, PresentVariables[0]] 这里。我想在把数据帧转换为 NumPy 数组时,删除含有空值的列。谢谢你的帮助!
初始化df
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
############
### DATA ###
############
# Training set: two dated observations (date 1 and 2) for each of three people,
# with 0/1 indicator columns for the occupation plus the known age and weight.
TrainingData = {
    'name': ['Alex', 'Ben', 'Marry', 'Alex', 'Ben', 'Marry'],
    'teacher': [1, 0, 0, 1, 0, 0],
    'doctor': [0, 1, 0, 0, 1, 0],
    'engineer': [0, 0, 1, 0, 0, 1],
    'age': [27, 32, 78, 27, 32, 78],
    'weight': [160, 209, 130, 164, 206, 132],
    'date': [1, 1, 1, 2, 2, 2],
}
# Test set: the same three people at date 3, with age and weight missing.
TestData = {
    'name': ['Alex', 'Ben', 'Marry'],
    'teacher': [1, 0, 0],
    'doctor': [0, 1, 0],
    'engineer': [0, 0, 1],
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises
    # AttributeError there.
    'age': [np.nan, np.nan, np.nan],
    'weight': [np.nan, np.nan, np.nan],
    # Key corrected from 'data' to 'date' so the test frame has the same
    # columns as the training frame.
    'date': [3, 3, 3],
}
# Convert to pandas dataframe
df_train = pd.DataFrame(TrainingData)
df_test = pd.DataFrame(TestData)
将 weight(体重)列指定为目标变量:
# Regression target: the 'weight' column of the training frame.
y_train = df_train.loc[:, "weight"]
通过排除体重和年龄列创建x_train。
# Training features: everything except the target ('weight') and 'age'.
x_train = df_train.drop(columns=["weight", "age"])
name 列包含字符串输入。通过标签编码将其转换为数字:
from sklearn.preprocessing import LabelEncoder
# Learn an integer code for each distinct name and replace the string column
# with those codes. `le` stays in scope so the same mapping can be applied to
# other frames afterwards.
le = LabelEncoder()
x_train = x_train.assign(name=le.fit_transform(x_train["name"])).to_numpy()
拟合模型:
from sklearn.neighbors import KNeighborsRegressor
# Build a 2-nearest-neighbour regressor and fit it on the training features
# and weights; fit() returns the estimator itself, so the calls can be chained.
neigh = KNeighborsRegressor(n_neighbors=2).fit(x_train, y_train)
在test中也应用相同的转换
# Test features: drop 'age' and 'weight', encode 'name' with the encoder
# already fitted on the training names, then convert to a NumPy array.
x_test = df_test.drop(columns=["age", "weight"])
x_test = x_test.assign(name=le.transform(x_test["name"])).to_numpy()
运行预测
# Predict a weight for every test row and print the resulting array.
predicted_weights = neigh.predict(x_test)
print(predicted_weights)
对于 age 列,我在训练时将其删除了,因为您想要删除含有空值的列。
也可以在训练时保留 age 列,但需要在预测步骤之前对测试数据中的 age 列进行缺失值填补(impute),因为该列在测试数据框中有空值。