我正在使用Google Cloud DLP来检查BigQuery中的敏感数据。我想知道是否可以使用一个 dlpJob 检查数据集中的所有表?如果是这样,我应该如何设置配置?
我试图省略配置中的 BQ tableId 字段。但它会返回http 400错误"必须设置table_id
"。这是否意味着使用一个 dlpJob,只能检查一个表,并且要扫描多个表,我们需要多个 dlpJobs?或者有没有办法使用一些正则表达式技巧扫描同一数据集中的多个表?
目前,一个作业只扫描一个表。该团队正在开发该功能 - 同时,您可以使用粗略的 shell 脚本手动创建作业,就像我在下面放置的那样,该脚本结合了 gcloud 和对 dlp API 的其余调用。您可能可以使用云函数做一些更流畅的事情。
先决条件: 1. 安装 gcloud。https://cloud.google.com/sdk/install 2. 使用以下参数运行此脚本: 3.
1.扫描大查询表的project_id。 2. 要将结果存储到的输出表的数据集 ID。 3. 要存储结果的输出表的表 ID。 4. 表示要扫描的行百分比的数字。
# 示例: # ./inspect_all_bq_tables.sh dlapi-test findings_daataset
# Reports a status of execution message to the log file and serial port
function report() {
local tag="${1}"
local message="${2}"
local timestamp="$(date +%s)000"
echo "${timestamp} - ${message}"
}
readonly -f report
# report_status_update
#
# Reports a status of execution message to the log file and serial port
function report_status_update() {
report "${MSGTAG_STATUS_UPDATE}" "STATUS=${1}"
}
readonly -f report_status_update
# create_job
#
# Creates a single dlp job for a given bigquery table.
function create_dlp_job {
local dataset_id="$1"
local table_id="$2"
local create_job_response=$(curl -s -H
"Authorization: Bearer $(gcloud auth print-access-token)"
-H "X-Goog-User-Project: $PROJECT_ID"
-H "Content-Type: application/json"
"$API_PATH/v2/projects/$PROJECT_ID/dlpJobs"
--data '
{
"inspectJob":{
"storageConfig":{
"bigQueryOptions":{
"tableReference":{
"projectId":"'$PROJECT_ID'",
"datasetId":"'$dataset_id'",
"tableId":"'$table_id'"
},
"rowsLimitPercent": "'$PERCENTAGE'"
},
},
"inspectConfig":{
"infoTypes":[
{
"name":"ALL_BASIC"
}
],
"includeQuote":true,
"minLikelihood":"LIKELY"
},
"actions":[
{
"saveFindings":{
"outputConfig":{
"table":{
"projectId":"'$PROJECT_ID'",
"datasetId":"'$FINDINGS_DATASET_ID'",
"tableId":"'$FINDINGS_TABLE_ID'"
},
"outputSchema": "BASIC_COLUMNS"
}
}
},
{
"publishFindingsToCloudDataCatalog": {}
}
]
}
}')
if [[ $create_job_response != *"dlpJobs"* ]]; then
report_status_update "Error creating dlp job: $create_job_response"
exit 1
fi
local new_dlpjob_name=$(echo "$create_job_response"
head -5 | grep -Po '"name": *K"[^"]*"' | tr -d '"' | head -1)
report_status_update "DLP New Job: $new_dlpjob_name"
}
readonly -f create_dlp_job
# List the datasets for a given project. Once we have these we can list the
# tables within each one.
function create_jobs() {
# The grep pulls the dataset id. The td removes the quotation marks.
local list_datasets_response=$(curl -s -H
"Authorization: Bearer $(gcloud auth print-access-token)" -H
"Content-Type: application/json"
"$BIGQUERY_PATH/projects/$PROJECT_ID/datasets")
if [[ $list_datasets_response != *"kind"* ]]; then
report_status_update "Error listing bigquery datasets: $list_datasets_response"
exit 1
fi
local dataset_ids=$(echo $list_datasets_response
| grep -Po '"datasetId": *K"[^"]*"' | tr -d '"')
# Each row will look like "datasetId", with the quotation marks
for dataset_id in ${dataset_ids}; do
report_status_update "Looking up tables for dataset $dataset_id"
local list_tables_response=$(curl -s -H
"Authorization: Bearer $(gcloud auth print-access-token)" -H
"Content-Type: application/json"
"$BIGQUERY_PATH/projects/$PROJECT_ID/datasets/$dataset_id/tables")
if [[ $list_tables_response != *"kind"* ]]; then
report_status_update "Error listing bigquery tables: $list_tables_response"
exit 1
fi
local table_ids=$(echo "$list_tables_response"
| grep -Po '"tableId": *K"[^"]*"' | tr -d '"')
for table_id in ${table_ids}; do
report_status_update "Creating DLP job to inspect table $table_id"
create_dlp_job "$dataset_id" "$table_id"
done
done
}
readonly -f create_jobs
PROJECT_ID=$1
FINDINGS_DATASET_ID=$2
FINDINGS_TABLE_ID=$3
PERCENTAGE=$4
API_PATH="https://dlp.googleapis.com"
BIGQUERY_PATH="https://www.googleapis.com/bigquery/v2"
# Main
create_jobs