Preprocessing and feature engineering with an AWS Step Functions workflow



I am trying to implement a machine learning workflow in AWS using Step Functions. My code lives in a Jupyter notebook instance. The preprocessing and feature engineering part of the workflow is a separate Python program that I invoke from a ProcessingStep step definition.

The preprocessing step should do some data type conversions, split the raw data into train and test files, and save those files to a defined S3 bucket folder. When I execute the workflow, the preprocessing step shown below runs successfully, but it does not create and save the training and test files in the defined S3 folder path.

Why is this happening, and how can I fix it?
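For reference, the ProcessingStep that calls this script is defined roughly like the sketch below, using the Step Functions Data Science SDK together with a SageMaker SKLearnProcessor. The role ARN, S3 URIs, job name, and framework version here are placeholders rather than my exact values:

from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from stepfunctions.steps import ProcessingStep

sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",                              # placeholder version
    role="arn:aws:iam::111122223333:role/MySageMakerRole",   # placeholder role
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

processing_step = ProcessingStep(
    "Preprocessing step",
    processor=sklearn_processor,
    job_name="loan-preprocessing-job",                        # placeholder job name
    inputs=[
        ProcessingInput(
            source="s3://my-bucket/loan-data/raw-data.csv",            # placeholder S3 URI
            destination="/opt/ml/processing/input",
            input_name="raw-data",
        ),
        ProcessingInput(
            source="s3://my-bucket/loan-data/code/preprocessing.py",   # placeholder S3 URI
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
    ],
    outputs=[
        # These output mappings are what copy the container folders back to S3
        ProcessingOutput(
            source="/opt/ml/processing/train",
            destination="s3://my-bucket/loan-data/train",              # placeholder S3 URI
            output_name="train_data",
        ),
        ProcessingOutput(
            source="/opt/ml/processing/test",
            destination="s3://my-bucket/loan-data/test",               # placeholder S3 URI
            output_name="test_data",
        ),
    ],
    container_entrypoint=["python3", "/opt/ml/processing/input/code/preprocessing.py"],
)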

%%writefile preprocessing.py
import argparse
import os
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action="ignore", category=DataConversionWarning)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-test-split-ratio", type=float, default=0.3)
    args, _ = parser.parse_known_args()
    print("Received arguments {}".format(args))
    input_data_path = os.path.join("/opt/ml/processing/input", "raw-data.csv")

    print("Reading input data from {}".format(input_data_path))
    df = pd.read_csv(input_data_path)

    # Handle null values
    df['Gender'] = df['Gender'].fillna('Male')
    df['Married'] = df['Married'].fillna('Yes')
    df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mean())
    df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna('360')
    df['Self_Employed'] = df['Self_Employed'].fillna('No')
    df['Credit_History'] = df['Credit_History'].fillna('1')
    df['Dependents'] = df['Dependents'].fillna('0')
    df.loc[df.Dependents == '3+', 'Dependents'] = 4

    # Convert data types to numeric
    df.loc[df.Loan_Status == 'N', 'Loan_Status'] = 0
    df.loc[df.Loan_Status == 'Y', 'Loan_Status'] = 1
    df.loc[df.Gender == 'Male', 'Gender'] = 0
    df.loc[df.Gender == 'Female', 'Gender'] = 1
    df.loc[df.Married == 'No', 'Married'] = 0
    df.loc[df.Married == 'Yes', 'Married'] = 1
    df.loc[df.Education == 'Graduate', 'Education'] = 0
    df.loc[df.Education == 'Not Graduate', 'Education'] = 1
    df.loc[df.Self_Employed == 'No', 'Self_Employed'] = 0
    df.loc[df.Self_Employed == 'Yes', 'Self_Employed'] = 1
    # property_area = pd.get_dummies(df['Property_Area'], drop_first=True)
    # df = pd.concat([df, property_area], axis=1)
    df['Married']          = df['Married'].astype(str).astype(int)
    df['Dependents']       = df['Dependents'].astype(str).astype(int)
    df['Education']        = df['Education'].astype(str).astype(int)
    df['Self_Employed']    = df['Self_Employed'].astype(str).astype(int)
    df['Loan_Amount_Term'] = df['Loan_Amount_Term'].astype(str).astype(float)
    df['Credit_History']   = df['Credit_History'].astype(str).astype(float)
    df['Loan_Status']      = df['Loan_Status'].astype(str).astype(int)
    # df['Semiurban']        = df['Semiurban'].astype(str).astype(int)
    # df['Urban']            = df['Urban'].astype(str).astype(int)
    df = df.drop('Loan_ID', axis=1)

    split_ratio = args.train_test_split_ratio
    print("Splitting data into train and test sets with ratio {}".format(split_ratio))
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop("Loan_Status", axis=1), df["Loan_Status"], test_size=split_ratio, random_state=0
    )

    print("Running preprocessing and feature engineering transformations")
    # Fit the encoder on the training split only, then apply it to both splits
    encoder = OneHotEncoder(handle_unknown="ignore")
    train_features = encoder.fit_transform(X_train)
    test_features = encoder.transform(X_test)

    print("Train data shape after preprocessing: {}".format(train_features.shape))
    print("Test data shape after preprocessing: {}".format(test_features.shape))

    # Local container paths; the ProcessingStep output configuration is expected
    # to copy these folders to the S3 destinations after the job finishes
    train_features_output_path = os.path.join("/opt/ml/processing/train", "train_features.csv")
    train_labels_output_path = os.path.join("/opt/ml/processing/train", "train_labels.csv")
    test_features_output_path = os.path.join("/opt/ml/processing/test", "test_features.csv")
    test_labels_output_path = os.path.join("/opt/ml/processing/test", "test_labels.csv")

    print("Saving training features to {}".format(train_features_output_path))
    pd.DataFrame(train_features.toarray()).to_csv(train_features_output_path, header=False, index=False)
    print("Saving test features to {}".format(test_features_output_path))
    pd.DataFrame(test_features.toarray()).to_csv(test_features_output_path, header=False, index=False)
    print("Saving training labels to {}".format(train_labels_output_path))
    y_train.to_csv(train_labels_output_path, header=False, index=False)
    print("Saving test labels to {}".format(test_labels_output_path))
    y_test.to_csv(test_labels_output_path, header=False, index=False)

If this is an external Python script, try using the AWS SDK for Python (Boto3) to write the output to S3 directly.

Example of writing a JSON file:

import json
import boto3

s3 = boto3.resource('s3')
your_bucketname = 'xyz_bucket'
bucket = s3.Bucket(your_bucketname)

## PUT: serialize the payload and write it to the bucket as abc.json
Analytics = {}  # placeholder for whatever object you want to persist
s3object = s3.Object(your_bucketname, 'abc.json')
s3object.put(
    Body=bytes(json.dumps(Analytics).encode('UTF-8'))
)
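Applied to the preprocessing script above, the same idea would look roughly like the following sketch, uploading the CSV files the script writes locally; the bucket name and key prefix are placeholders:

import boto3

s3_client = boto3.client("s3")
bucket_name = "your-bucket-name"   # placeholder bucket
prefix = "loan-data"               # placeholder key prefix

# Upload each file the script writes locally to the desired S3 location
local_to_s3 = {
    "/opt/ml/processing/train/train_features.csv": f"{prefix}/train/train_features.csv",
    "/opt/ml/processing/train/train_labels.csv": f"{prefix}/train/train_labels.csv",
    "/opt/ml/processing/test/test_features.csv": f"{prefix}/test/test_features.csv",
    "/opt/ml/processing/test/test_labels.csv": f"{prefix}/test/test_labels.csv",
}
for local_path, key in local_to_s3.items():
    s3_client.upload_file(local_path, bucket_name, key)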
