How to create a partitioned table at runtime via Apache Beam using Python



I am trying to create a new partitioned BigQuery table at runtime with the code below, but I don't see an option to pass the column name "_time" on which my new BQ table needs to be partitioned. Can anyone help me with this?

My code:

#------------Import Lib-----------------------#
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
import os, sys
import argparse
import json
import logging
from apache_beam.options.pipeline_options import SetupOptions
from datetime import datetime

#------------Set up BQ parameters-----------------------#
# Replace with Project Id
project = 'xxxx'

#------------Splitting Of Records----------------------#
class Transaction_DB_UC2(beam.DoFn):
    def process(self, element):
        logging.info(element)
        result = json.loads(element)
        data_time = result.get('_time', 'null')
        data_dest = result.get('dest', 'null')
        data_DBID = result.get('DBID', 'null')
        data_SESSIONID = result.get('SESSIONID', 'null')
        data_USERHOST = result.get('USERHOST', 'null')
        data_raw = result.get('_raw', 'null')
        data_ACTION = result.get('ACTION', 'null')
        data_host = result.get('host', 'null')
        data_result = result.get('result', 'null')
        data_DBUSER = result.get('DBUSER', 'null')
        data_OS_USERNAME = result.get('OS_USERNAME', 'null')
        # Map the numeric ACTION codes to readable names.
        data_ACTION_NAME = result.get('ACTION', 'null').replace('100', 'LOGON').replace('101', 'LOGOFF')
        return [{"_time": data_time[:-8], "dest": data_dest, "DBID": data_DBID,
                 "SESSIONID": data_SESSIONID, "_raw": data_raw, "USERHOST": data_USERHOST,
                 "ACTION": data_ACTION, "host": data_host, "result": data_result,
                 "DBUSER": data_DBUSER, "OS_USERNAME": data_OS_USERNAME,
                 "ACTION_NAME": data_ACTION_NAME}]

def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        help='Input file to process.')
    parser.add_argument(
        '--pro_id',
        dest='pro_id',
        type=str,
        default='ORACLE_SEC_DEFAULT',
        help='project id')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    p1 = beam.Pipeline(options=pipeline_options)
    #data_f = sys.argv[1]
    logging.info('***********')
    logging.info(known_args.input)
    data_loading = (
        p1
        | 'Read from File' >> beam.io.ReadFromText(known_args.input, skip_header_lines=0)
    )

    project_id = "xxxxx"
    dataset_id = 'test123'
    table_schema_DB_UC2 = ('_time:DATETIME, dest:STRING, DBID:STRING, SESSIONID:STRING, _raw:STRING, USERHOST:STRING, ACTION:STRING, host:STRING, result:STRING, DBUSER:STRING, OS_USERNAME:STRING, ACTION_NAME:STRING')

    # Persist to BigQuery
    # WriteToBigQuery accepts the data as list of JSON objects
    #---------------------Index = DB-UC2---------------------------------------
    result = (
        data_loading
        | 'Clean-DB-UC2' >> beam.ParDo(Transaction_DB_UC2())
        | 'Write-DB-UC2' >> beam.io.WriteToBigQuery(
            table=known_args.pro_id,
            dataset=dataset_id,
            project=project_id,
            schema=table_schema_DB_UC2,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
    result = p1.run()
    result.wait_until_finish()

if __name__ == '__main__':
    #logging.getLogger().setLevel(logging.INFO)
    path_service_account = 'ml-fbf8cabcder.json'
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = path_service_account
    run()
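For context, this is roughly what the DoFn emits for a single record (the input values below are made up, and the [:-8] slice assumes _time carries a fixed 8-character suffix such as '.123 UTC'):

import json

# Hypothetical input line, for illustration only:
sample = json.dumps({
    '_time': '2021-01-15 10:23:45.123 UTC', 'dest': 'db-host-1',
    'DBID': '1234', 'SESSIONID': '42', 'USERHOST': 'apphost',
    '_raw': '...', 'ACTION': '100', 'host': 'srv1',
    'result': 'success', 'DBUSER': 'scott', 'OS_USERNAME': 'oracle'})

out = Transaction_DB_UC2().process(sample)[0]
print(out['_time'])        # 2021-01-15 10:23:45
print(out['ACTION_NAME'])  # LOGON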

I want to create the partition on the field "_time"; please suggest how to achieve this. Thanks.

I believe this can be done with additional_bq_parameters (mind the limitations) and the timePartitioning parameter.

When creating a new BigQuery table, there are a number of extra parameters that one may need to specify. For example, clustering, time partitioning, data encoding, etc. It is possible to provide these additional parameters by passing a Python dictionary as additional_bq_parameters (reference).

In your case, you can add the timePartitioning parameter to the WriteToBigQuery transform with the required type and the optional field fields (note that field must be a top-level TIMESTAMP or DATE field):

additional_bq_parameters={'timePartitioning': {
    'type': 'DAY',
    'field': '_time'
}}
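In your pipeline this would go directly into the existing write step, roughly like this (a sketch based on your code; only the additional_bq_parameters argument is new, and the clustering line is just an optional illustration of the other table options mentioned in the quoted docs):

| 'Write-DB-UC2' >> beam.io.WriteToBigQuery(
      table=known_args.pro_id,
      dataset=dataset_id,
      project=project_id,
      schema=table_schema_DB_UC2,
      # New: create the table day-partitioned on the _time column.
      additional_bq_parameters={
          'timePartitioning': {'type': 'DAY', 'field': '_time'},
          # Optional example of another table option:
          # 'clustering': {'fields': ['DBUSER']},
      },
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)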

I haven't had a chance to test it yet. I'll try to reproduce it tomorrow; let me know if it works for you.

EDIT

I finally got a chance to try the timePartitioning parameter to create a partitioned table, and it worked.
Here is a simple pipeline to test it:

#!/usr/bin/env python
import apache_beam as beam

PROJECT = 'YOUR_PROJECT'
BUCKET = 'YOUR_BUCKET'

def run():
    argv = [
        '--project={0}'.format(PROJECT),
        '--job_name=YOUR_JOB_NAME',
        '--save_main_session',
        '--staging_location=gs://{0}/staging/'.format(BUCKET),
        '--temp_location=gs://{0}/staging/'.format(BUCKET),
        '--region=us-central1',
        '--runner=DataflowRunner'
    ]

    p = beam.Pipeline(argv=argv)

    table_schema = {'fields': [
        {'name': 'country', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': '_time', 'type': 'DATETIME', 'mode': 'NULLABLE'},
        {'name': 'query', 'type': 'STRING', 'mode': 'NULLABLE'}]}

    # Ask BigQuery to create the table day-partitioned on the _time column.
    additional_bq_parameters = {
        'timePartitioning': {'type': 'DAY', 'field': '_time'}}

    elements = (p | beam.Create([
        {'country': 'mexico', '_time': '2020-06-10 22:19:26', 'query': 'acapulco'},
        {'country': 'canada', '_time': '2020-12-11 15:42:32', 'query': 'influenza'},
    ]))

    elements | beam.io.WriteToBigQuery(
        table='YOUR_DATASET.YOUR_NEW_TABLE',
        schema=table_schema,
        additional_bq_parameters=additional_bq_parameters,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    p.run()

if __name__ == '__main__':
    run()
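Once the job has run, a quick way to check that the table really was created partitioned is to inspect it with the BigQuery client library (a minimal sketch, assuming the google-cloud-bigquery package is installed and using the same placeholder names as above):

from google.cloud import bigquery

client = bigquery.Client(project='YOUR_PROJECT')
table = client.get_table('YOUR_DATASET.YOUR_NEW_TABLE')

# time_partitioning is a TimePartitioning object for a partitioned table
# (here with type 'DAY' and field '_time'), or None if the table is not partitioned.
print(table.time_partitioning)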
