我想提取一些数据,如" email addresses "
..从PDF文件的表格中,并使用我提取的电子邮件地址发送电子邮件给那些人。
目前为止我在网上搜索到的结果:
-
我必须将PDF文件转换为Excel,以便轻松读取数据并根据需要使用它们。
-
我找到了一些免费的dll,如
itextsharp
或PDFsharp
。
但是我没有找到任何代码片段来帮助在c#中做到这一点。有解决办法吗?
你绝对不需要将PDF转换为Excel。首先,请确定您的PDF是否包含文本数据,还是扫描图像。如果它包含文本数据,那么你使用"一些免费的dll"是正确的。我推荐iTextSharp,因为它很流行,而且很容易使用。
现在是有争议的部分。如果你不需要可靠的解决方案,最简单的方法是将所有PDF读取为字符串,然后使用正则表达式检索电子邮件。以下是使用iTextSharp阅读PDF并提取电子邮件的示例(不完美):
public string PdfToString(string fileName)
{
var sb = new StringBuilder();
var reader = new PdfReader(fileName);
for (int page = 1; page <= reader.NumberOfPages; page++)
{
var strategy = new SimpleTextExtractionStrategy();
string text = PdfTextExtractor.GetTextFromPage(reader, page, strategy);
text = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(text)));
sb.Append(text);
}
reader.Close();
return sb.ToString();
}
//adjust expression as needed
Regex emailRegex = new Regex("Email Address (?<email>.+?) Passport No");
public IEnumerable<string> ExtractEmails(string content)
{
var matches = emailRegex.Matches(content);
foreach (Match m in matches)
{
yield return m.Groups["email"].Value;
}
}
使用bytescout PDF Extractor SDK,我们可以将整个页面提取为csv,如下所示。
CSVExtractor extractor = new CSVExtractor();
extractor.RegistrationName = "demo";
extractor.RegistrationKey = "demo";
TableDetector tdetector = new TableDetector();
tdetector.RegistrationKey = "demo";
tdetector.RegistrationName = "demo";
// Load the document
extractor.LoadDocumentFromFile("C:\sample.pdf");
tdetector.LoadDocumentFromFile("C:\sample.pdf");
int pageCount = tdetector.GetPageCount();
for (int i = 1; i <= pageCount; i++)
{
int j = 1;
do
{
extractor.SetExtractionArea(tdetector.GetPageRect_Left(i),
tdetector.GetPageRect_Top(i),
tdetector.GetPageRect_Width(i),
tdetector.GetPageRect_Height(i)
);
// and finally save the table into CSV file
extractor.SavePageCSVToFile(i, "C:\page-" + i + "-table-" + j + ".csv");
j++;
} while (tdetector.FindNextTable()); // search next table
}
public void Convert(string fileNames) {
int pageCount = 0;
iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(fileNames);
pageCount = reader.NumberOfPages;
string ext = System.IO.Path.GetExtension(fileNames);
//string[] outfiles = new string[pageCount];
//Excel.Application app = new Excel.Application();
//app.Workbooks.Add("");
CSVExtractor extractor = new CSVExtractor();
//string outfilePDF1 = fileNames.Replace((System.IO.Path.GetFileName(fileNames)), (System.IO.Path.GetFileName(fileNames).Replace(".pdf", "") + "_rez" + ".csv"));
string outfilePDFExcel1 = fileNames.Replace((System.IO.Path.GetFileName(fileNames)),
(System.IO.Path.GetFileName(fileNames).Replace(".pdf", "") + "_rez" + ".xls"));
extractor.RegistrationName = "demo";
extractor.RegistrationKey = "demo";
string folderName = @"C:UsersDafinaDesktopPDF_EditProjectPDF_EditProjectPDFs";
string pathString = System.IO.Path.Combine(folderName, System.IO.Path.GetFileName(fileNames).Replace(".pdf", "")) + "-CSVs";
System.IO.Directory.CreateDirectory(pathString);
for (int i = 0; i < pageCount; i++)
{
string outfilePDF = fileNames.Replace((System.IO.Path.GetFileName(fileNames)),
(System.IO.Path.GetFileName(fileNames).Replace(".pdf", "") + "_" + (i + 1).ToString()) + ext);
extractor.LoadDocumentFromFile(outfilePDF);
//string outfile = fileNames.Replace((System.IO.Path.GetFileName(fileNames)),
// (System.IO.Path.GetFileName(fileNames).Replace(".pdf", "") + "_" + (i + 1).ToString()) + ".csv");
string outfile = fileNames.Replace((System.IO.Path.GetFileName(fileNames)),
(System.IO.Path.GetFileName(fileNames).Replace(".pdf", "") + "-CSVs\" + "Sheet_" + (i + 1).ToString()) + ".csv");
extractor.SaveCSVToFile(outfile);
}
Excel.Application xlApp = new Microsoft.Office.Interop.Excel.Application();
if (xlApp == null)
{
Console.WriteLine("Excel is not properly installed!!");
return;
}
Excel.Workbook xlWorkBook;
object misValue = System.Reflection.Missing.Value;
xlWorkBook = xlApp.Workbooks.Add(misValue);
string[] cvsFiles = Directory.GetFiles(pathString);
Array.Sort(cvsFiles, new AlphanumComparatorFast());
//string[] lista = new string[pageCount];
//for (int t = 0; t < pageCount; t++)
//{
// lista[t] = cvsFiles[t];
//}
//Array.Sort(lista, new AlphanumComparatorFast());
Microsoft.Office.Interop.Excel.Worksheet xlWorkSheet;
for (int i = 0; i < cvsFiles.Length; i++)
{
int sheet = i + 1;
xlWorkSheet = xlWorkBook.Sheets[sheet];
if (i < cvsFiles.Length - 1)
{
xlWorkBook.Worksheets.Add(Type.Missing, xlWorkSheet, Type.Missing, Type.Missing);
}
int sheetRow = 1;
Encoding objEncoding = Encoding.Default;
StreamReader readerd = new StreamReader(File.OpenRead(cvsFiles[i]));
int ColumLength = 0;
while (!readerd.EndOfStream)
{
string line = readerd.ReadLine();
Console.WriteLine(line);
try
{
string[] columns = line.Split((new char[] { '"' }));
for (int col = 0; col < columns.Length; col++)
{
if (ColumLength < columns.Length)
{
ColumLength = columns.Length;
}
if (col % 2 == 0)
{
}
else if (columns[col] == "")
{
}
else
{
xlWorkSheet.Cells[sheetRow, col + 1] = columns[col].Replace(""", "");
}
}
sheetRow++;
}
catch (Exception e)
{
string msg = e.Message;
}
}
int k = 1;
for (int s = 1; s <= ColumLength; s++)
{
xlWorkSheet.Columns[k].Delete();
k++;
}
releaseObject(xlWorkSheet);
readerd.Close();
}
xlWorkBook.SaveAs(outfilePDFExcel1, Microsoft.Office.Interop.Excel.XlFileFormat.xlWorkbookNormal,
misValue, misValue, misValue, misValue, Microsoft.Office.Interop.Excel.XlSaveAsAccessMode.xlExclusive,
misValue, misValue, misValue, misValue, misValue);
xlWorkBook.Close(true, misValue, misValue);
xlApp.Quit();
releaseObject(xlWorkBook);
releaseObject(xlApp);
var dir = new DirectoryInfo(pathString);
dir.Attributes = dir.Attributes & ~FileAttributes.ReadOnly;
dir.Delete(true);
}
最好的代码是使用第三方dll
namespace ConsoleApp2
{
internal class Program
{
static void Main(string[] args)
{
string pathToPdf = @"D:abcabc.pdf";
string pathToExcel = Path.ChangeExtension(pathToPdf, ".xls");
SautinSoft.PdfFocus f = new SautinSoft.PdfFocus();
f.ExcelOptions.ConvertNonTabularDataToSpreadsheet = false;
// 'true' = Preserve original page layout.
// 'false' = Place tables before text.
f.ExcelOptions.PreservePageLayout = true;
// The information includes the names for the culture, the writing system,
// the calendar used, the sort order of strings, and formatting for dates and numbers.
System.Globalization.CultureInfo ci = new System.Globalization.CultureInfo("en-US");
ci.NumberFormat.NumberDecimalSeparator = ",";
ci.NumberFormat.NumberGroupSeparator = ".";
f.ExcelOptions.CultureInfo = ci;
f.OpenPdf(pathToPdf);
if (f.PageCount > 0)
{
int result = f.ToExcel(pathToExcel);
// Open the resulted Excel workbook.
if (result == 0)
{
System.Diagnostics.Process.Start(pathToExcel);
}
}
}
}
}