我希望有人能帮助我改进Teradata TPT加载脚本。我使用下面的脚本将一个3GB分隔的CSV文件加载到Teradata中。该文件位于我的本地笔记本电脑硬盘上。加载该文件大约需要30分钟,这相当长。要加载的行总数约为3000万。关于绩效改进的任何建议。
DEFINE JOB LOAD_TD_FROM_CSV
DESCRIPTION 'Load Teradata table from CSV File'
(
DEFINE SCHEMA FOOD_TPT /*Define Table schema*/
DESCRIPTION 'FOOD_TPT'
(
col1 VARCHAR(20),
col2 VARCHAR(100),
col3 VARCHAR(100),
col4 VARCHAR(100),
col5 VARCHAR(100),
col6 VARCHAR(100),
col7 VARCHAR(100),
col8 VARCHAR(100)
);
DEFINE OPERATOR DDL_OPERATOR
TYPE DDL
ATTRIBUTES
(
VARCHAR TdpId = 'system', /*System Name*/
VARCHAR UserName = 'user', /*USERNAME*/
VARCHAR UserPassword = 'password', /*Password*/
VARCHAR Errorlist ='3807' /*This is added to skip 'Table does not exist error' and treat it as warnig*/
);
DEFINE OPERATOR LOAD_CSV /*Load information*/
DESCRIPTION 'Operator to Load CSV Data'
TYPE LOAD
SCHEMA *
ATTRIBUTES
(
VARCHAR PrivateLogName,
VARCHAR TraceLevel = 'None',
INTEGER TenacityHours = 1,
INTEGER TenacitySleep = 1,
INTEGER MaxSessions = 4,
INTEGER MinSessions = 1,
INTEGER BUFFERSIZE =16,
VARCHAR TargetTable = 'FOOD_TPT_STG', /*Define target table name where the file will be loaded*/
VARCHAR LogTable = 'FOOD_TPT_LOG', /*Define Log table name*/
VARCHAR ErrorTable1 = 'FOOD_TPT_STG_E1',/*There are 2 error tables. Define them. First table is _ET table*/
VARCHAR ErrorTable2 = 'FOOD_TPT_STG_E2', /*Define _UV table*/
VARCHAR TdpId = 'system', /*System Name*/
VARCHAR UserName = 'user', /*Username*/
VARCHAR UserPassword = 'password' /*Password*/
);
DEFINE OPERATOR READ_CSV
DESCRIPTION 'Operator to Read CSV File'
TYPE DATACONNECTOR PRODUCER
SCHEMA FOOD_TPT
ATTRIBUTES
(
VARCHAR Filename = 'file.csv' /*give file name with path*/
,VARCHAR Format = 'Delimited'
,VARCHAR TextDelimiter = ','
,VARCHAR AcceptExcessColumns = 'N'
,VARCHAR PrivateLogName = 'LOAD_FROM_CSV'
,Integer SkipRows=1 /*skips the header in csv file*/
);
Step Setup_Tables /*Enter all executable SQLs in this step*/
(
APPLY
('Drop table FOOD_TPT_STG_E1;'), /*Drop error tables*/
('Drop table FOOD_TPT_STG_E2;'),
('Drop table FOOD_TPT_LOG;'), /*Drop Log Table*/
('Drop table FOOD_TPT_STG;'), /*Drop Target staging tables*/
('CREATE TABLE FOOD_TPT_STG ,NO FALLBACK ,
NO BEFORE JOURNAL,
NO AFTER JOURNAL,
CHECKSUM = DEFAULT,
datablocksize= 1022 kbytes,
DEFAULT MERGEBLOCKRATIO
(
col1 VARCHAR(20),
col2 VARCHAR(100),
col3 VARCHAR(100),
col4 VARCHAR(100),
col5 VARCHAR(100),
col6 VARCHAR(100),
col7 VARCHAR(100),
col8 VARCHAR(100)
)
NO PRIMARY INDEX;') /*Create Target table*/
TO OPERATOR (DDL_OPERATOR);
);
Step Load_Table
(
APPLY ('INSERT INTO FOOD_RISK_TPT_STG
(
:col1
,:col2
,:col3
,:col4
,:col5
,:col6
,:col7
,:col8
);') /*Inserts records from CSV file into Target Table*/
TO OPERATOR (LOAD_CSV)
SELECT * FROM operator(READ_CSV);
);
);
提前感谢
正如Fred所写,您指定的BUFFERSIZE为16KB->每个块22行(FastLoad根据定义的最大大小计算(,这导致发送160万条消息。删除该属性,默认情况下会得到1MB,每个块1400行。此外,你可以简化你的脚本如下:
DEFINE JOB LOAD_TD_FROM_CSV
DESCRIPTION 'Load Teradata table from CSV File'
(
Step Setup_Tables /*Enter all executable SQLs in this step*/
(
APPLY
('Drop table FOOD_TPT_STG_E1;'), /*Drop error tables*/
('Drop table FOOD_TPT_STG_E2;'),
('Drop table FOOD_TPT_LOG;'), /*Drop Log Table*/
('Drop table FOOD_TPT_STG;'), /*Drop Target staging tables*/
('CREATE TABLE FOOD_TPT_STG ,NO FALLBACK ,
NO BEFORE JOURNAL,
NO AFTER JOURNAL,
CHECKSUM = DEFAULT,
datablocksize= 1022 kbytes,
DEFAULT MERGEBLOCKRATIO
(
col1 VARCHAR(20),
col2 VARCHAR(100),
col3 VARCHAR(100),
col4 VARCHAR(100),
col5 VARCHAR(100),
col6 VARCHAR(100),
col7 VARCHAR(100),
col8 VARCHAR(100)
)
NO PRIMARY INDEX;') /*Create Target table*/
TO OPERATOR ($DDL
(
TdpId = 'system', /*System Name*/
UserName = 'user', /*USERNAME*/
UserPassword = 'password', /*Password*/
Errorlist ='3807' /*This is added to skip 'Table does not exist error' and treat it as warnig*/
)
);
Step Load_Table
(
APPLY ($INSERT 'FOOD_RISK_TPT_STG') /*Inserts records from CSV file into Target Table*/
TO OPERATOR ($LOAD(
/* BUFFERSIZE = 16384, Default is 1 MB, increasing it further to the max 16MB might improve a bit */
TargetTable = 'FOOD_TPT_STG', /*Define target table name where the file will be loaded*/
LogTable = 'FOOD_TPT_LOG', /*Define Log table name*/
ErrorTable1 = 'FOOD_TPT_STG_E1',/*There are 2 error tables. Define them. First table is _ET table*/
ErrorTable2 = 'FOOD_TPT_STG_E2', /*Define _UV table*/
TdpId = 'system', /*System Name*/
UserName = 'user', /*Username*/
UserPassword = 'password' /*Password*/
)
)
SELECT * FROM operator($FILE_READER
(
Filename = 'file.csv' /*give file name with path*/
,Format = 'Delimited'
,TextDelimiter = ','
,AcceptExcessColumns = 'N'
,PrivateLogName = 'LOAD_FROM_CSV'
,SkipRows=1 /*skips the header in csv file*/
));
);
还有一些作业变量文件可以使脚本更好地可重用