使用 tesseract.js 获取 PDF 格式的结果时出现问题



我正在使用 tesseract.js 构建一个简单的 JavaScript OCR(光学字符识别)应用程序,并且我将 { tessjs_create_pdf: "1" } 传递给 .recognize() 方法以获取 PDF 格式的结果,但它不起作用。请问有人能告诉我问题出在哪里吗?

const express  = require('express');
const app      = express();
const fs       = require('fs');
const multer   = require('multer');
const { createWorker } = require("tesseract.js");
// Tesseract.js worker, created once when this module loads.
const worker           = createWorker();
// Render views with the EJS template engine.
app.set("view engine", "ejs");
// multer disk storage: every upload lands in ./uploads under the file name
// the client sent.
const intoUploadsDir = (req, file, cb) => cb(null, './uploads');
const keepOriginalName = (req, file, cb) => cb(null, file.originalname);
const storage = multer.diskStorage({
  destination: intoUploadsDir,
  filename: keepOriginalName,
});
// Middleware accepting exactly one file from the "avatar" form field.
const upload = multer({ storage }).single('avatar');
// Home page: renders the "index" view.
app.get('/', function showIndex(req, res) {
  res.render('index');
});
/**
 * POST /upload — stores the uploaded image (form field "avatar"), runs OCR on
 * it and replies with the recognized text.
 *
 * Every failure path answers the client instead of leaving the request
 * hanging: upload errors, a missing file, read errors and OCR errors.
 */
app.post('/upload', (req, res) => {
  upload(req, res, (uploadErr) => {
    if (uploadErr) {
      console.log('this is your error', uploadErr);
      // MulterError means the request itself was unacceptable (wrong field, limits, ...).
      const status = uploadErr instanceof multer.MulterError ? 400 : 500;
      return res.status(status).send('File upload failed.');
    }
    if (!req.file) {
      return res.status(400).send('No file was uploaded (expected form field "avatar").');
    }
    // req.file.path is the location multer actually wrote the file to.
    fs.readFile(req.file.path, async (readErr, data) => {
      if (readErr) {
        console.log('this is your error', readErr);
        return res.status(500).send('Could not read the uploaded file.');
      }
      // A terminated worker cannot be reused, so each request gets its own
      // worker and terminates only that one.
      const ocrWorker = createWorker();
      try {
        await ocrWorker.load();
        await ocrWorker.loadLanguage('eng');
        await ocrWorker.initialize('eng');
        // NOTE(review): with the tesseract.js 2.x worker API this option does not
        // appear to produce a PDF; worker.getPDF() is the way to obtain one —
        // confirm against the installed version.
        const { data: { text } } = await ocrWorker.recognize(data, { tessjs_create_pdf: "1" });
        res.send(text);
      } catch (ocrErr) {
        console.log('this is your error', ocrErr);
        if (!res.headersSent) {
          res.status(500).send('OCR failed.');
        }
      } finally {
        // A failed shutdown must not surface as an unhandled rejection.
        ocrWorker.terminate().catch((terminateErr) => {
          console.log('this is your error', terminateErr);
        });
      }
    });
  });
});
// Prefer the port supplied by the environment; 3000 is only the fallback.
// (The operands were reversed before, so PORT was never honoured.)
const port = process.env.PORT || 3000;
app.listen(port, () => {
  console.log("server has started!!!!");
});

你必须使用 getPDF() 函数来生成 PDF 文件(适用于 tesseract.js 2.1.4)。在识别文本之后、终止 worker 之前添加以下代码,并使用 fs 模块的 writeFileSync 将文件写入磁盘。

// Ask the worker for the PDF of the last recognition, then persist its bytes.
const pdfResult = await worker.getPDF("Tesseract OCR Result");
const pdfBuffer = Buffer.from(pdfResult.data);
fs.writeFileSync("tesseract-ocr-result.pdf", pdfBuffer);

如果要下载生成的PDF文件,请使用以下命令将用户重定向到另一条路由...

// Redirect the client to the /download route.
res.redirect("/download")

然后将以下代码添加到您的路由中。文件将保存在根目录中,因此我们可以使用 __dirname 作为路径。

/**
 * GET /download — sends the generated PDF to the client as a file download.
 */
app.get("/download", (req, res) => {
  // Must be the same file name that fs.writeFileSync used when the PDF was
  // generated ("tesseract-ocr-result.pdf"); the ".js" variant never exists.
  const file = `${__dirname}/tesseract-ocr-result.pdf`;
  res.download(file);
});

您的最终代码将如下所示。为了避免命名冲突,我在以下代码的 fs.readFile(...) 回调(第 3 行)中将参数 data 重命名为 img。

/**
 * POST /upload — stores the uploaded image (form field "avatar"), runs OCR on
 * it, writes the result as a PDF next to this script and redirects the client
 * to /download.
 *
 * Every failure path answers the client instead of leaving the request
 * hanging: upload errors, a missing file, read errors and OCR/PDF errors.
 */
app.post('/upload', (req, res) => {
  upload(req, res, (uploadErr) => {
    if (uploadErr) {
      console.log('this is your error', uploadErr);
      // MulterError means the request itself was unacceptable (wrong field, limits, ...).
      const status = uploadErr instanceof multer.MulterError ? 400 : 500;
      return res.status(status).send('File upload failed.');
    }
    if (!req.file) {
      return res.status(400).send('No file was uploaded (expected form field "avatar").');
    }
    // req.file.path is the location multer actually wrote the file to.
    fs.readFile(req.file.path, async (readErr, img) => {
      if (readErr) {
        console.log('this is your error', readErr);
        return res.status(500).send('Could not read the uploaded file.');
      }
      // A terminated worker cannot be reused, so each request gets its own
      // worker and terminates only that one.
      const ocrWorker = createWorker();
      try {
        await ocrWorker.load();
        await ocrWorker.loadLanguage('eng');
        await ocrWorker.initialize('eng');
        await ocrWorker.recognize(img);
        // getPDF must run after recognize() and before terminate().
        const { data } = await ocrWorker.getPDF("Tesseract OCR Result");
        // Write under __dirname so the path matches the one the /download
        // route reads, whatever the current working directory is.
        fs.writeFileSync(`${__dirname}/tesseract-ocr-result.pdf`, Buffer.from(data));
        res.redirect("/download");
      } catch (ocrErr) {
        console.log('this is your error', ocrErr);
        if (!res.headersSent) {
          res.status(500).send('OCR or PDF generation failed.');
        }
      } finally {
        // A failed shutdown must not surface as an unhandled rejection.
        ocrWorker.terminate().catch((terminateErr) => {
          console.log('this is your error', terminateErr);
        });
      }
    });
  });
});
// GET /download: offers tesseract-ocr-result.pdf (located beside this script)
// to the client as a download.
app.get("/download", function sendResultPdf(req, res) {
  res.download(`${__dirname}/tesseract-ocr-result.pdf`);
});

您可以使用 pdfkit:

const express = require('express');
const app = express();
const fs = require('fs');
const multer = require('multer');
const { createWorker } = require('tesseract.js');
// Worker whose logger prints every message tesseract.js hands it to the console.
const logWorkerMessage = (message) => {
  console.log(message);
};
const worker = createWorker({ logger: logWorkerMessage });
const cors = require('cors');
const PDFDocument = require('pdfkit');
// PDF document created at module load; whatever is written to it is streamed
// into tesseract.js-ocr-result.pdf (path relative to the working directory).
const doc = new PDFDocument();
const pdfFile = fs.createWriteStream('tesseract.js-ocr-result.pdf');
doc.pipe(pdfFile);
// Enable CORS with the middleware's default options.
app.use(cors());
const bodyParser = require('body-parser');
// JSON and URL-encoded bodies share the same 50 MB size cap.
const BODY_SIZE_LIMIT = '50mb';
app.use(bodyParser.json({ limit: BODY_SIZE_LIMIT }));
const urlencodedParser = bodyParser.urlencoded({
  extended: true,
  limit: BODY_SIZE_LIMIT,
  parameterLimit: 1000000,
});
app.use(urlencodedParser);
// multer disk storage: uploads are written to the "images" folder beside this
// script, keeping the file name the client sent.
const Storage = multer.diskStorage({
  destination(req, file, callback) {
    callback(null, __dirname + '/images');
  },
  filename(req, file, callback) {
    callback(null, file.originalname);
  },
});
// Middleware accepting exactly one file from the "avatar" form field.
const upload = multer({ storage: Storage }).single('avatar');
/**
 * POST /upload — stores the uploaded image (form field "avatar"), runs OCR on
 * it and builds a two-page PDF with pdfkit: the image on page one, the
 * recognized text on page two. Once the PDF is fully written the client is
 * redirected to /download.
 *
 * Every failure path answers the client instead of leaving the request
 * hanging: upload errors, a missing file, read errors and OCR/PDF errors.
 */
app.post('/upload', (req, res) => {
  upload(req, res, (uploadErr) => {
    if (uploadErr) {
      console.log(uploadErr);
      // MulterError means the request itself was unacceptable (wrong field, limits, ...).
      const status = uploadErr instanceof multer.MulterError ? 400 : 500;
      return res.status(status).send('File upload failed.');
    }
    console.log('Request ---', req.body);
    console.log('Request file ---', req.file);
    if (!req.file) {
      return res.status(400).send('No file was uploaded (expected form field "avatar").');
    }
    // req.file.path is the location multer actually wrote the file to; a path
    // relative to the working directory may point somewhere else.
    fs.readFile(req.file.path, async (readErr, image) => {
      if (readErr) {
        console.log(readErr);
        return res.status(500).send('Could not read the uploaded file.');
      }
      // A terminated worker cannot be reused, so each request gets its own
      // worker and terminates only that one.
      const ocrWorker = createWorker({ logger: (m) => console.log(m) });
      try {
        await ocrWorker.load();
        await ocrWorker.loadLanguage('eng');
        await ocrWorker.initialize('eng');
        const {
          data: { text },
        } = await ocrWorker.recognize(image);

        // A PDFDocument cannot be written to after end(), so a fresh document
        // and output stream are created for every request.
        const pdf = new PDFDocument();
        const output = fs.createWriteStream(`${__dirname}/tesseract.js-ocr-result.pdf`);
        // Listeners are attached in the same tick the stream is created so an
        // early 'error' event cannot go unhandled.
        const written = new Promise((resolve, reject) => {
          output.on('finish', resolve);
          output.on('error', reject);
        });
        pdf.pipe(output);
        pdf.image(image, {
          fit: [250, 300],
          align: 'center',
          valign: 'center',
        });
        pdf.addPage().fontSize(25).text(text);
        pdf.end();

        // Respond only after the file is completely flushed to disk, so that
        // /download never serves a partial PDF.
        await written;
        res.redirect('/download');
      } catch (ocrErr) {
        console.log(ocrErr);
        if (!res.headersSent) {
          res.status(500).send('OCR or PDF generation failed.');
        }
      } finally {
        // A failed shutdown must not surface as an unhandled rejection.
        ocrWorker.terminate().catch((terminateErr) => {
          console.log(terminateErr);
        });
      }
    });
  });
});
// GET /download: offers tesseract.js-ocr-result.pdf (located beside this
// script) to the client as a download.
app.get('/download', function sendResultPdf(req, res) {
  res.download(`${__dirname}/tesseract.js-ocr-result.pdf`);
});
// Start the HTTP server on port 5000 and log once it is listening.
const LISTEN_PORT = 5000;
app.listen(LISTEN_PORT, function onServerStarted() {
  console.log('server Started');
});

最新更新