让 xpdf 的 Pdf2Txt 函数变为线程安全



我尝试在 MFC 应用程序中使用 xpdf 源代码将 PDF 转换为文本。示例代码取自他们的网站（或代码仓库）：

/**
 * Converts the PDF file named by PdfFile into the text file named by TxtFile
 * using xpdf, with UTF-8 as the output text encoding.
 *
 * NOT thread-safe: every call replaces the process-wide `globalParams`
 * object on entry and deletes it on exit, so a call finishing on one thread
 * destroys the object another thread is still using.
 *
 * @param PdfFile  file name handed to PDFDoc.
 * @param TxtFile  file name handed to TextOutputDev.
 * @return 0 on success,
 *         1 if the document could not be opened,
 *         2 if the text output device could not be created,
 *         3 if the document does not permit copying text,
 *         99 if the output text encoding could not be obtained.
 */
int Pdf2Txt(std::string PdfFile, std::string TxtFile) const
{
    GString* ownerPW, *userPW;
    UnicodeMap* uMap;
    TextOutputDev* textOut;
    TextOutputControl textOutControl;
    GString* textFileName;
    // Declared without an initialiser so that `goto err1` below does not
    // jump over an initialisation (which is ill-formed in C++).
    PDFDoc* doc;
    int exitCode;

    // Settings that mirror the options of the command-line tool; they are
    // fixed here.
    char textEncName[128] = "";
    char textEOL[16] = "";
    GBool noPageBreaks = gFalse;
    GBool quiet = gFalse;
    // "\001" is the sentinel meaning "no password supplied".
    char ownerPassword[33] = "\001";
    char userPassword[33] = "\001";
    int firstPage = 1;
    int lastPage = 0;
    GBool tableLayout = gFalse;
    double fixedPitch = 0;
    GBool physLayout = gFalse;
    GBool simpleLayout = gFalse;
    GBool simple2Layout = gFalse;
    GBool linePrinter = gFalse;
    GBool rawOrder = gFalse;
    double fixedLineSpacing = 0;
    double marginLeft = 0;
    double marginRight = 0;
    double marginTop = 0;
    double marginBottom = 0;
    GBool clipText = gFalse;
    GBool discardDiag = gFalse;
    GBool insertBOM = gFalse;

    exitCode = 99;

    // read config file
    globalParams = new GlobalParams("");
    if (textEncName[0])
    {
        globalParams->setTextEncoding(textEncName);
    }
    if (textEOL[0])
    {
        if (!globalParams->setTextEOL(textEOL))
        {
            fprintf(stderr, "Bad '-eol' value on command line\n");
        }
    }
    if (noPageBreaks)
    {
        globalParams->setTextPageBreaks(gFalse);
    }
    if (quiet)
    {
        globalParams->setErrQuiet(quiet);
    }

    // Set UNICODE support
    globalParams->setTextEncoding("UTF-8");

    // get mapping to output encoding
    if (!(uMap = globalParams->getTextEncoding()))
    {
        error(errConfig, -1, "Couldn't get text encoding");
        goto err1;
    }

    // open PDF file; a password object is only created when the sentinel
    // has been replaced by a real password
    if (ownerPassword[0] != '\001')
    {
        ownerPW = new GString(ownerPassword);
    }
    else
    {
        ownerPW = NULL;
    }
    if (userPassword[0] != '\001')
    {
        userPW = new GString(userPassword);
    }
    else
    {
        userPW = NULL;
    }
    doc = new PDFDoc((char*)PdfFile.c_str(), ownerPW, userPW);
    if (userPW)
    {
        delete userPW;
    }
    if (ownerPW)
    {
        delete ownerPW;
    }
    if (! doc->isOk())
    {
        exitCode = 1;
        goto err2;
    }

    // check for copy permission
    if (! doc->okToCopy())
    {
        error(errNotAllowed, -1, "Copying of text from this document is not allowed.");
        exitCode = 3;
        goto err2;
    }

    // construct text file name
    textFileName = new GString(TxtFile.c_str());

    // get page range; lastPage < 1 means "up to the last page"
    if (firstPage < 1)
    {
        firstPage = 1;
    }
    if (lastPage < 1 || lastPage > doc->getNumPages())
    {
        lastPage = doc->getNumPages();
    }

    // write text file: choose the layout mode from the option flags
    if (tableLayout)
    {
        textOutControl.mode = textOutTableLayout;
        textOutControl.fixedPitch = fixedPitch;
    }
    else if (physLayout)
    {
        textOutControl.mode = textOutPhysLayout;
        textOutControl.fixedPitch = fixedPitch;
    }
    else if (simpleLayout)
    {
        textOutControl.mode = textOutSimpleLayout;
    }
    else if (simple2Layout)
    {
        textOutControl.mode = textOutSimple2Layout;
    }
    else if (linePrinter)
    {
        textOutControl.mode = textOutLinePrinter;
        textOutControl.fixedPitch = fixedPitch;
        textOutControl.fixedLineSpacing = fixedLineSpacing;
    }
    else if (rawOrder)
    {
        textOutControl.mode = textOutRawOrder;
    }
    else
    {
        textOutControl.mode = textOutReadingOrder;
    }
    textOutControl.clipText = clipText;
    textOutControl.discardDiagonalText = discardDiag;
    textOutControl.insertBOM = insertBOM;
    textOutControl.marginLeft = marginLeft;
    textOutControl.marginRight = marginRight;
    textOutControl.marginTop = marginTop;
    textOutControl.marginBottom = marginBottom;

    textOut = new TextOutputDev(textFileName->getCString(), &textOutControl, gFalse, gTrue);
    if (textOut->isOk())
    {
        doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0, gFalse, gTrue, gFalse);
    }
    else
    {
        delete textOut;
        exitCode = 2;
        goto err3;
    }
    delete textOut;
    exitCode = 0;

    // clean up
err3:
    delete textFileName;
err2:
    delete doc;
    //  uMap->decRefCnt();
err1:
    // Destroys the object shared by every thread; this is what makes
    // concurrent calls crash.
    delete globalParams;

    // check for memory leaks
    Object::memCheck(stderr);
    gMemReport(stderr);

    return exitCode;
}

到目前为止,一切都很好。但这段代码不是线程安全的:如果我试图在多线程代码中运行这段代码,它就会崩溃:

// TextOutputDev.cc
if (uMap->isUnicode())
{
lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); // <-- crash

为什么？因为有一个变量 globalParams，它在函数的最后一行被删除，而它是所有线程共用的：

// Frees the GlobalParams object that all threads share; the pointer itself
// is not reset, so any thread still converting keeps using a freed object.
delete globalParams;

globalParams 是在 GlobalParams.h 中声明的 extern 全局变量（属于 xpdf 代码的一部分）：

// xpdf/GlobalParams.h
// The global parameters object.
// A single pointer with external linkage: every thread that calls into xpdf
// goes through this same object.
extern GlobalParams *globalParams;

如何让这个函数变为线程安全？因为这个“问题变量”位于 xpdf 源代码中，而不在我的代码中……

附言：总而言之，globalParams 在 xpdf 代码中声明，却在我的（客户端）代码中被删除。

这里可以看到xpdf源代码:https://github.com/jeroen/xpdf/blob/c2c946f517eb09cfd09d957e0f3b04d44bf6f827/src/poppler/GlobalParams.h

https://github.com/jeroen/xpdf/blob/c2c946f517eb09cfd09d957e0f3b04d44bf6f827/src/poppler/GlobalParams.cc

可以尝试按如下方式重构代码。我已将 GlobalParams 的初始化代码移到一个单独的函数中。该函数应在初始化期间、或在启动调用 Pdf2Txt() 的线程之前调用（且只调用一次）。当然，GlobalParams 实例不应被销毁，因为它可能被多个线程使用。让它一直留在内存中不会对你的应用程序造成损害：它只是一个对象，而且并不大——它包含许多 int 和 bool 成员变量，但这些变量占用的空间不多，还有相当多的 string* 变量（我猜最初为 null 或为空），所以它最多只有几 KB。

/**
 * Creates the process-wide `globalParams` object and configures it for
 * UTF-8 text output.
 *
 * Intended to be called once, before any thread that calls Pdf2Txt() is
 * started. The object is deliberately never deleted afterwards because the
 * worker threads keep using it. Calling this again would replace the pointer
 * without freeing the previous object.
 *
 * @return true if the output text encoding could be obtained, false
 *         otherwise (an error is reported through error()).
 */
bool InitGlobalParams()
{
    UnicodeMap* uMap;

    // Settings that mirror the options of the command-line tool; they are
    // fixed here.
    char textEncName[128] = "";
    char textEOL[16] = "";
    GBool noPageBreaks = gFalse;
    GBool quiet = gFalse;

    // read config file
    globalParams = new GlobalParams("");
    if (textEncName[0])
    {
        globalParams->setTextEncoding(textEncName);
    }
    if (textEOL[0])
    {
        if (!globalParams->setTextEOL(textEOL))
        {
            fprintf(stderr, "Bad '-eol' value on command line\n");
        }
    }
    if (noPageBreaks)
    {
        globalParams->setTextPageBreaks(gFalse);
    }
    if (quiet)
    {
        globalParams->setErrQuiet(quiet);
    }

    // Set UNICODE support
    globalParams->setTextEncoding("UTF-8");

    // get mapping to output encoding; only used here to verify that the
    // encoding is available
    // NOTE(review): the original tool releases this map with decRefCnt()
    // when done - confirm whether that is needed here.
    if (!(uMap = globalParams->getTextEncoding()))
    {
        error(errConfig, -1, "Couldn't get text encoding");
        return false;
    }
    return true;
}
/**
 * Converts the PDF file named by PdfFile into the text file named by TxtFile
 * using xpdf.
 *
 * Precondition: InitGlobalParams() has been called beforehand. This function
 * neither creates nor deletes `globalParams`, so the object stays available
 * to the other threads running conversions.
 *
 * @param PdfFile  file name handed to PDFDoc.
 * @param TxtFile  file name handed to TextOutputDev.
 * @return 0 on success,
 *         1 if the document could not be opened,
 *         2 if the text output device could not be created,
 *         3 if the document does not permit copying text.
 */
int Pdf2Txt(std::string PdfFile, std::string TxtFile) const
{
    GString* ownerPW, *userPW;
    TextOutputDev* textOut;
    TextOutputControl textOutControl;
    GString* textFileName;
    int exitCode;

    // Settings that mirror the options of the command-line tool; they are
    // fixed here.
    // "\001" is the sentinel meaning "no password supplied".
    char ownerPassword[33] = "\001";
    char userPassword[33] = "\001";
    int firstPage = 1;
    int lastPage = 0;
    GBool tableLayout = gFalse;
    double fixedPitch = 0;
    GBool physLayout = gFalse;
    GBool simpleLayout = gFalse;
    GBool simple2Layout = gFalse;
    GBool linePrinter = gFalse;
    GBool rawOrder = gFalse;
    double fixedLineSpacing = 0;
    double marginLeft = 0;
    double marginRight = 0;
    double marginTop = 0;
    double marginBottom = 0;
    GBool clipText = gFalse;
    GBool discardDiag = gFalse;
    GBool insertBOM = gFalse;

    exitCode = 99;

    // open PDF file; a password object is only created when the sentinel
    // has been replaced by a real password
    if (ownerPassword[0] != '\001')
    {
        ownerPW = new GString(ownerPassword);
    }
    else
    {
        ownerPW = NULL;
    }
    if (userPassword[0] != '\001')
    {
        userPW = new GString(userPassword);
    }
    else
    {
        userPW = NULL;
    }
    PDFDoc* doc = new PDFDoc((char*)PdfFile.c_str(), ownerPW, userPW);
    if (userPW)
    {
        delete userPW;
    }
    if (ownerPW)
    {
        delete ownerPW;
    }
    if (! doc->isOk())
    {
        exitCode = 1;
        goto err2;
    }

    // check for copy permission
    if (! doc->okToCopy())
    {
        error(errNotAllowed, -1, "Copying of text from this document is not allowed.");
        exitCode = 3;
        goto err2;
    }

    // construct text file name
    textFileName = new GString(TxtFile.c_str());

    // get page range; lastPage < 1 means "up to the last page"
    if (firstPage < 1)
    {
        firstPage = 1;
    }
    if (lastPage < 1 || lastPage > doc->getNumPages())
    {
        lastPage = doc->getNumPages();
    }

    // write text file: choose the layout mode from the option flags
    if (tableLayout)
    {
        textOutControl.mode = textOutTableLayout;
        textOutControl.fixedPitch = fixedPitch;
    }
    else if (physLayout)
    {
        textOutControl.mode = textOutPhysLayout;
        textOutControl.fixedPitch = fixedPitch;
    }
    else if (simpleLayout)
    {
        textOutControl.mode = textOutSimpleLayout;
    }
    else if (simple2Layout)
    {
        textOutControl.mode = textOutSimple2Layout;
    }
    else if (linePrinter)
    {
        textOutControl.mode = textOutLinePrinter;
        textOutControl.fixedPitch = fixedPitch;
        textOutControl.fixedLineSpacing = fixedLineSpacing;
    }
    else if (rawOrder)
    {
        textOutControl.mode = textOutRawOrder;
    }
    else
    {
        textOutControl.mode = textOutReadingOrder;
    }
    textOutControl.clipText = clipText;
    textOutControl.discardDiagonalText = discardDiag;
    textOutControl.insertBOM = insertBOM;
    textOutControl.marginLeft = marginLeft;
    textOutControl.marginRight = marginRight;
    textOutControl.marginTop = marginTop;
    textOutControl.marginBottom = marginBottom;

    textOut = new TextOutputDev(textFileName->getCString(), &textOutControl, gFalse, gTrue);
    if (textOut->isOk())
    {
        doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0, gFalse, gTrue, gFalse);
    }
    else
    {
        delete textOut;
        exitCode = 2;
        goto err3;
    }
    delete textOut;
    exitCode = 0;

    // clean up
err3:
    delete textFileName;
err2:
    delete doc;

    // Do NOT delete the one and only GlobalParams instance here: other
    // threads may still be using it.

    // check for memory leaks
    Object::memCheck(stderr);
    gMemReport(stderr);

    return exitCode;
}

上面的代码甚至可能无法编译（我是用文本编辑器修改的，并没有真正测试过），所以请做出任何必要的改动。xpdf 的函数很可能不会修改 globalParams 对象（对它们来说是“只读”的），所以这段代码很有可能可以工作。顺便说一句，GlobalParams 类定义（GlobalParams.h）中有一个 #if MULTITHREADED 指令，其代码块中包含 3 个互斥对象。实现代码（GlobalParams.cc）在访问 GlobalParams 成员时会锁定互斥锁，因此这可能导致某些线程等待一段时间，尽管我不知道要等多久（必须彻底检查代码，这本身就是一个小“项目”）。你可以试着测试一下。

当然，@KJ 上面表达的担忧仍然适用：并行运行许多这样的线程可能会使系统过载（不过我不确定 xpdf 是否使用多个线程来处理单个 PDF，也不清楚它是如何配置的，有人能帮忙说明吗？），还可能导致其他进程变慢。它也可能造成 I/O 瓶颈（磁盘和/或网络），因此最初请尝试使用较少的线程，并检查其扩展情况。

最新更新