Azure ML Pipeline: glob patterns inside of path are not supported by volume mount



I am trying to run an Azure ML pipeline using the Azure ML Python SDK v2. The pipeline's input is a data asset whose source is the default blob datastore. Its path is azureml:raw_data_v2:1 and its type is URI_FOLDER. When I run the pipeline I get the following error:

[2022-11-03 16:10:29Z] Job failed, job RunId is fca7d858-2b46-43bb-89e8-048161eafbe. Error:
{"Error":{"Code":"UserError","Severity":null,
"Message":"Not Compliant: ArgumentError(InvalidArgument { argument: \"arguments.path\", expected: \"Glob patterns inside of path are not supported by volume mount. Path must be a direct path to file or folder, or end with '/*' or '/**/*' to match full contents of the volume\", actual: \"REDACTED\" })",
"Code":"data-capability.UriMountSession.PyFuseError","Target":"","Category":"UserError",
"ErrorDetails":[
  {"Key":"NonCompliantReason","Value":"ArgumentError(InvalidArgument { argument: \"arguments.path\", expected: \"Glob patterns inside of path are not supported by volume mount. Path must be a direct path to file or folder, or end with '/*' or '/**/*' to match full contents of the volume\", actual: \"REDACTED\" })"},
  {"Key":"StackTrace","Value":"
    File \"/opt/miniconda/envs/data-capability/lib/python3.7/site-packages/data_capability/capability_session.py\", line 70, in start
      (data_path, sub_data_path) = session.start()
    File \"/opt/miniconda/envs/data-capability/lib/python3.7/site-packages/data_capability/data_sessions.py\", line 364, in start
      options=mnt_options
    File \"/opt/miniconda/envs/data-capability/lib/python3.7/site-packages/azureml/dataprep/fuse/dprepfuse.py\", line 696, in rslex_uri_volume_mount
      raise e
    File \"/opt/miniconda/envs/data-capability/lib/python3.7/site-packages/azureml/dataprep/fuse/dprepfuse.py\", line 690, in rslex_uri_volume_mount
      mount_context = RslexDirectURIMountContext(mount_point, uri, options)"}
],
"MessageFormat":null,"MessageParameters":{},"ReferenceCode":null,"DetailsUri":null,"Target":null,"Details":[],"InnerError":null,"DebugInfo":null,"AdditionalInfo":null},
"Correlation":null,"Environment":null,"Location":null,"Time":"0001-01-01T00:00:00+00:00","ComponentName":null}

The most important part is:

ArgumentError(InvalidArgument { argument: "arguments.path", expected: "Glob patterns inside of path are not supported by volume mount. Path must be a direct path to file or folder, or end with '/*' or '/**/*' to match full contents of the volume" })

I assume this happens when it tries to mount my input data into the Docker container that will run my pipeline, but I'm not sure. Here is the full code for my pipeline:

from azure.identity import DefaultAzureCredential, ManagedIdentityCredential
from azure.ai.ml import MLClient, Input, Output, command
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.constants import AssetTypes
from mldesigner import command_component
import os
import pandas as pd
import numpy as np
import glob
from PIL import Image
import json
import pickle
os.environ['AZURE_TENANT_ID'] = 'xxx-xxx-xxx-xxx'
credential = DefaultAzureCredential()
# Check if given credential can get token successfully.
credential.get_token("https://management.azure.com/.default")
ml_client = MLClient.from_config(credential=credential)
def get_val_test_filenames(input_ml_path, dataset):
    df = pd.read_csv(f'{input_ml_path}/evaluation/{dataset}_reference_slides.csv', encoding='utf-8')
    slides = df['Slide_id'].to_list()
    return slides
def create_id(path):
    parts = path.split('/')
    file_name = parts[-1][:-4]
    hash_str = parts[-2]
    doc = parts[-3]
    id = f'{doc}__{hash_str}__{file_name}'
    return id
def create_y_val(input_ml_path, val_files):
    y_val = []
    with open(f'{input_ml_path}/evaluation/golden_dev.json') as y_val_file:
        val_dict = json.load(y_val_file)
    for vf in val_files:
        sim_list = val_dict[vf]
        y_val.append(sim_list)
    return y_val  # this should be list of lists
# x_train, x_val, x_test, y_val
def create_no_hier_datasets(input_ml_path, output_ml_path):
    print(f'************* inside create no hier datasets *********************')
    train_dir = f'{input_ml_path}/raw/images/final_slides/'
    val_slides = get_val_test_filenames(input_ml_path, 'val')
    test_slides = get_val_test_filenames(input_ml_path, 'test')
    x_train, x_val, x_test, y_val = [], [], [], []
    cnt = 0
    for filename in glob.iglob(train_dir + '**/thumb*.jpg', recursive=True):
        if 'small' in filename:
            continue

        img_np = np.asarray(Image.open(filename))
        if img_np.shape != (768, 1024, 3):
            print(f'{img_np.shape} does not equal (768, 1024, 3)')
            continue
        id = create_id(filename)
        if id in val_slides:
            x_val.append(img_np)
            y_val.append(filename)
        elif id in test_slides:
            x_test.append(img_np)
        else:
            x_train.append(img_np)

    x_train_np = np.asarray(x_train)
    x_val_np = np.asarray(x_val)
    x_test_np = np.asarray(x_test)
    y_val_list = create_y_val(input_ml_path, y_val)

    np.save(f"{output_ml_path}/x_train.npy", x_train_np)
    np.save(f"{output_ml_path}/x_val.npy", x_val_np)
    np.save(f"{output_ml_path}/x_test.npy", x_test_np)
    with open(f"{output_ml_path}/y_val.npy", 'wb') as fp:
        pickle.dump(y_val_list, fp)

output_version = '1'
input_version = '1'
@command_component(
    name="create_train_test_data",
    version="1",
    display_name="Create train and test data",
    description="creates train and test data",
    environment=dict(
        conda_file="conda.yml",
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
    ),
)
def create_train_test_data_component(
    input_data: Input(type=AssetTypes.URI_FOLDER),
    output_data: Output(type=AssetTypes.URI_FOLDER),
):
    create_no_hier_datasets(input_data, output_data)

@pipeline(compute='cpu-cluster', description="pipeline to create train and test data")
def data_prep_pipeline(pipeline_input_data):
    create_data_node = create_train_test_data_component(input_data=pipeline_input_data)

raw_data_ds = Input(type=AssetTypes.URI_FOLDER, path="azureml:raw_data_v2:1")
output_data_ds = Output(type=AssetTypes.URI_FOLDER, path="azureml:train_test_data:1")
pipeline_job = data_prep_pipeline(pipeline_input_data=raw_data_ds)
pipeline_job = ml_client.jobs.create_or_update(pipeline_job, experiment_name="no_hierarchy")
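
For reference, a URI_FOLDER asset like raw_data_v2 is typically registered with the SDK v2 along these lines. This is only an illustrative sketch, not my exact registration code; the datastore and folder names below are placeholders.

from azure.ai.ml.entities import Data

# Hypothetical registration of the input asset; the real path may differ.
raw_data = Data(
    name="raw_data_v2",
    version="1",
    type=AssetTypes.URI_FOLDER,
    path="azureml://datastores/workspaceblobstore/paths/raw_data/",  # placeholder path
)
ml_client.data.create_or_update(raw_data)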

Since the error seems to occur before my code even starts running, I really don't understand what is going wrong. Does anyone have experience with this?

By design, uri_folder does not support globbing in the path.

What is the scenario in which you need globbing in the uri path?
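
If the path of the registered raw_data_v2 asset (or of the Input itself) contains a glob, pointing it at the plain folder and doing the glob matching inside the component instead, as create_no_hier_datasets already does with glob.iglob, avoids the mount error. A minimal sketch, with placeholder datastore and folder names:

# Fails to mount: glob pattern inside the path.
bad_input = Input(
    type=AssetTypes.URI_FOLDER,
    path="azureml://datastores/workspaceblobstore/paths/raw_data/**/thumb*.jpg",
)

# Mounts fine: direct path to the folder; glob over the files inside the component.
good_input = Input(
    type=AssetTypes.URI_FOLDER,
    path="azureml://datastores/workspaceblobstore/paths/raw_data/",
)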
