使用fastcsv将s3流传输到dynawdb:并没有插入所有数据



当csv文件上传到我的s3 bucket时,我的lambda将被触发,将我的数据插入DynamoDB。我需要一个流,因为文件太大,无法作为完整对象下载。

const batchWrite = async (clientDynamoDB, itemsToProcess) => {
const ri = {};
ri[TABLE_DYNAMO] = itemsToProcess.map((itm) => toPutRequest(itm));
const params = { RequestItems: ri };
await clientDynamoDB.batchWriteItem(params).promise();
};
function runStreamPromiseAsync(stream, clientDynamoDB) {
return new Promise((resolve, reject) => {
const sizeChunk = 25;
let itemsToProcess = [];
stream
.pipe(fastCsv.parse({headers: Object.keys(schemaGeData), trim: true}))
.on("data", (row) => {
stream.pause();
itemsToProcess.push(row);
if (itemsToProcess.length === sizeChunk) {
batchWrite(clientDynamoDB, itemsToProcess).finally(() => {
stream.resume();
});
itemsToProcess = [];
}
})
.on("error", (err) => {
console.log(err);
reject("Error");
})
.on("end", () => {
stream.pause();
console.log("end");
batchWrite(clientDynamoDB, itemsToProcess).finally(() => {
resolve("OK");
});
});
});
}
module.exports.main = async (event, context, callback) => {
context.callbackWaitsForEmptyEventLoop = false;
const AWS = require('aws-sdk');
const s3 = new AWS.S3();
const object = event.Records[0].s3;
const bucket = object.bucket.name;
const file = object.object.key;
const agent = new https.Agent({
keepAlive: true
});
const client = new AWS.DynamoDB({
httpOptions: {
agent
}
});
try {
//get Stream csv data
const stream = s3
.getObject({
Bucket: bucket,
Key: file
})
.createReadStream()
.on('error', (e) => {
console.log(e);
});
await runStreamPromiseAsync(stream, client);
} catch (e) {
console.log(e);
}
};

当我的文件是1000行时,所有内容都被插入,但当我有5000行时,我的函数只插入大约3000行,这个数字是随机的。。。有时多有时少。。

所以我想知道我在这里错过了什么?

我也读过这篇文章,但老实说,即使你暂停了第二个流,第一个流仍然在运行。。因此,如果有人对如何做到这一点有任何想法,我们将不胜感激!

感谢

我发现了为什么它没有被完全处理,这是因为batchWriteItem的回调可以返回未处理的项。所以我稍微更改了函数batchWriterunPromiseStreamAsync,因为我可能没有从itemsToProcess处理所有项目。

无论如何,这里是完整的代码:

const batchWrite = (client, itemsToProcess) => {
const ri = {};
ri[TABLE_DYNAMO] = itemsToProcess.map((itm) => toPutRequest(itm));
const items = { RequestItems: ri };
const processItemsCallback = function(err, data) {
return new Promise((resolve, reject) => {
if(!data || data.length === 0){
return resolve();
}
if(err){
return reject(err);
}
let params = {};
params.RequestItems = data.UnprocessedItems;
return client.batchWriteItem(params, processItemsCallback);
});
};
return client.batchWriteItem(items, processItemsCallback );
};
function runStreamPromiseAsync(stream, clientDynamoDB) {
return new Promise((resolve, reject) => {
const sizeChunk = 25;
let itemsToProcess = [];
let arrayPromise = [];
stream
.pipe(fastCsv.parse({headers: Object.keys(schemaGeData), trim: true}))
.on("error", (err) => {
console.log(err);
reject("Error");
})
.on('data', data => {
itemsToProcess.push(data);
if(itemsToProcess.length === sizeChunk){
arrayPromise.push(batchWrite(clientDynamoDB, itemsToProcess));
itemsToProcess = [];
}
})
.on('end', () => {
if(itemsToProcess.length !== 0){
arrayPromise.push(batchWrite(clientDynamoDB, itemsToProcess));
}
resolve(Promise.all(arrayPromise).catch(e => {
reject(e)
}));
});
});
}

相关内容

最新更新