如何在C#中将PDF文件转换为Excel



我想从PDF文件的表格中提取一些数据(例如电子邮件地址),然后使用提取到的电子邮件地址给这些人发送电子邮件。

目前为止我在网上搜索到的结果:

  1. 我必须将PDF文件转换为Excel,以便轻松读取数据并根据需要使用它们。

  2. 我找到了一些免费的dll,例如iTextSharp和PDFsharp。

但是我没有找到任何代码片段来帮助在c#中做到这一点。有解决办法吗?

你绝对不需要将PDF转换为Excel。首先,请确定您的PDF是否包含文本数据,还是扫描图像。如果它包含文本数据,那么你使用"一些免费的dll"是正确的。我推荐iTextSharp,因为它很流行,而且很容易使用。

接下来的做法可能有争议。如果你不要求方案十分可靠,最简单的方法是把整个PDF读取为一个字符串,然后用正则表达式提取电子邮件地址。下面是使用iTextSharp读取PDF并提取电子邮件地址的示例(并不完美):

/// <summary>
/// Extracts the text of every page of a PDF with iTextSharp and returns it as one string.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>The text of all pages, appended in page order with no separator inserted between pages.</returns>
public string PdfToString(string fileName)
{
    var sb = new StringBuilder();
    var reader = new PdfReader(fileName);
    try
    {
        // Pages are addressed from 1 up to and including NumberOfPages.
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            var strategy = new SimpleTextExtractionStrategy();
            // The extracted text is already a .NET string. Re-encoding it through Encoding.Default
            // can only replace characters that the ANSI code page cannot represent with '?',
            // so the text is appended unchanged.
            sb.Append(PdfTextExtractor.GetTextFromPage(reader, page, strategy));
        }
    }
    finally
    {
        // Close the reader even when extraction throws, so the file handle is not leaked.
        reader.Close();
    }
    return sb.ToString();
}
// Captures whatever text sits between the "Email Address" and "Passport No" labels
// into the named group "email"; adjust the expression to fit the document's layout.
Regex emailRegex = new Regex("Email Address (?<email>.+?) Passport No");

/// <summary>
/// Lazily yields the value of the "email" group for each match of <c>emailRegex</c> in the input.
/// </summary>
/// <param name="content">Text to scan, e.g. the full text extracted from a PDF.</param>
/// <returns>One string per match, in order of appearance.</returns>
public IEnumerable<string> ExtractEmails(string content)
{
    // Walk the matches one at a time instead of materialising a MatchCollection.
    for (Match hit = emailRegex.Match(content); hit.Success; hit = hit.NextMatch())
    {
        yield return hit.Groups["email"].Value;
    }
}

使用bytescout PDF Extractor SDK,我们可以将整个页面提取为csv,如下所示。

// Exports the pages of a PDF to CSV files with the Bytescout PDF Extractor SDK.
CSVExtractor extractor = new CSVExtractor();
extractor.RegistrationName = "demo";
extractor.RegistrationKey = "demo";
TableDetector tdetector = new TableDetector();
tdetector.RegistrationKey = "demo";
tdetector.RegistrationName = "demo";
// Load the document into both the extractor and the table detector.
// Verbatim strings are required here: "\s" and "\p" are not valid C# escape sequences,
// so the non-verbatim form of these paths does not compile.
extractor.LoadDocumentFromFile(@"C:\sample.pdf");
tdetector.LoadDocumentFromFile(@"C:\sample.pdf");
int pageCount = tdetector.GetPageCount();
for (int i = 1; i <= pageCount; i++)
{
    // j numbers the CSV files written for page i.
    int j = 1;
    do
    {
        // NOTE(review): the extraction area is the page rectangle, not the rectangle of a
        // detected table, so every pass of this do/while appears to export the same region.
        // Confirm against the SDK docs whether a table rectangle should be used instead.
        extractor.SetExtractionArea(
            tdetector.GetPageRect_Left(i),
            tdetector.GetPageRect_Top(i),
            tdetector.GetPageRect_Width(i),
            tdetector.GetPageRect_Height(i)
        );
        // and finally save the table into CSV file
        extractor.SavePageCSVToFile(i, @"C:\page-" + i + "-table-" + j + ".csv");
        j++;
    } while (tdetector.FindNextTable()); // search next table
}
/// <summary>
/// Builds an .xls workbook for a PDF: each per-page PDF file is exported to CSV with
/// <c>CSVExtractor</c>, then every CSV is loaded into its own worksheet through Excel interop.
/// </summary>
/// <param name="fileNames">
/// Path of the source PDF. For "name.pdf" the method loads "name_1.pdf", "name_2.pdf", ... from the
/// same folder (one per page) — presumably produced by an earlier page-splitting step; verify
/// against the caller.
/// </param>
/// <remarks>
/// The workbook is saved next to the source file as "name_rez.xls". The intermediate CSV files are
/// written to a "name-CSVs" folder next to the source file, which is deleted after the workbook has
/// been saved. <c>releaseObject</c> and <c>AlphanumComparatorFast</c> are defined elsewhere.
/// </remarks>
public void Convert(string fileNames) {
    // Only the page count is needed from iTextSharp, so the reader is closed immediately.
    int pageCount;
    iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(fileNames);
    try
    {
        pageCount = reader.NumberOfPages;
    }
    finally
    {
        reader.Close();
    }

    string ext = System.IO.Path.GetExtension(fileNames);
    string sourceDir = System.IO.Path.GetDirectoryName(fileNames) ?? string.Empty;
    string baseName = System.IO.Path.GetFileName(fileNames).Replace(".pdf", "");
    string outfilePDFExcel1 = System.IO.Path.Combine(sourceDir, baseName + "_rez" + ".xls");

    CSVExtractor extractor = new CSVExtractor();
    extractor.RegistrationName = "demo";
    extractor.RegistrationKey = "demo";

    // The CSV folder is derived from the input path. It was previously a hard-coded folder literal
    // whose path separators had been lost, and it only matched the location the CSV files were
    // written to when the input PDF happened to live in that folder.
    string pathString = System.IO.Path.Combine(sourceDir, baseName) + "-CSVs";
    System.IO.Directory.CreateDirectory(pathString);
    for (int i = 0; i < pageCount; i++)
    {
        string pageNumber = (i + 1).ToString();
        // Per-page input: "<name>_<page><ext>" in the source folder.
        string outfilePDF = System.IO.Path.Combine(sourceDir, baseName + "_" + pageNumber + ext);
        extractor.LoadDocumentFromFile(outfilePDF);
        // Per-page output: "<name>-CSVs\Sheet_<page>.csv".
        string outfile = System.IO.Path.Combine(pathString, "Sheet_" + pageNumber + ".csv");
        extractor.SaveCSVToFile(outfile);
    }

    Excel.Application xlApp = new Microsoft.Office.Interop.Excel.Application();
    if (xlApp == null)
    {
        Console.WriteLine("Excel is not properly installed!!");
        return;
    }

    object misValue = System.Reflection.Missing.Value;
    Excel.Workbook xlWorkBook = null;
    bool workbookClosed = false;
    try
    {
        xlWorkBook = xlApp.Workbooks.Add(misValue);
        string[] cvsFiles = Directory.GetFiles(pathString);
        // Sort with the project's alphanumeric comparer before assigning files to sheets.
        Array.Sort(cvsFiles, new AlphanumComparatorFast());

        for (int i = 0; i < cvsFiles.Length; i++)
        {
            Microsoft.Office.Interop.Excel.Worksheet xlWorkSheet = xlWorkBook.Sheets[i + 1];
            if (i < cvsFiles.Length - 1)
            {
                // Insert the next sheet right after the current one so that Sheets[i + 2] exists.
                xlWorkBook.Worksheets.Add(Type.Missing, xlWorkSheet, Type.Missing, Type.Missing);
            }

            int sheetRow = 1;
            int ColumLength = 0;
            using (StreamReader readerd = new StreamReader(File.OpenRead(cvsFiles[i])))
            {
                while (!readerd.EndOfStream)
                {
                    string line = readerd.ReadLine();
                    Console.WriteLine(line);
                    try
                    {
                        // Splitting on '"' puts the quoted field values at the odd indices and the
                        // separators between them at the even indices (assumes every field in the
                        // CSV is double-quoted — TODO confirm against the extractor's output).
                        string[] columns = line.Split('"');
                        if (ColumLength < columns.Length)
                        {
                            ColumLength = columns.Length;
                        }
                        for (int col = 1; col < columns.Length; col += 2)
                        {
                            if (columns[col].Length != 0)
                            {
                                // No quote can remain in a piece after splitting on '"',
                                // so the value is written as-is.
                                xlWorkSheet.Cells[sheetRow, col + 1] = columns[col];
                            }
                        }
                        sheetRow++;
                    }
                    catch (Exception e)
                    {
                        // Best effort: report the failing line and carry on with the next one.
                        Console.WriteLine("Skipping a line of " + cvsFiles[i] + ": " + e.Message);
                    }
                }
            }

            // Values were written to columns 2, 4, 6, ...; deleting column 1, then 2, then 3, ...
            // removes the empty column in front of each value as the remaining columns shift left.
            for (int s = 1; s <= ColumLength; s++)
            {
                xlWorkSheet.Columns[s].Delete();
            }

            releaseObject(xlWorkSheet);
        }

        xlWorkBook.SaveAs(outfilePDFExcel1, Microsoft.Office.Interop.Excel.XlFileFormat.xlWorkbookNormal,
            misValue, misValue, misValue, misValue, Microsoft.Office.Interop.Excel.XlSaveAsAccessMode.xlExclusive,
            misValue, misValue, misValue, misValue, misValue);
        xlWorkBook.Close(true, misValue, misValue);
        workbookClosed = true;
    }
    finally
    {
        // Always shut Excel down, so a failure above does not leave an Excel process running.
        if (xlWorkBook != null && !workbookClosed)
        {
            xlWorkBook.Close(false, misValue, misValue);
        }
        xlApp.Quit();
        if (xlWorkBook != null)
        {
            releaseObject(xlWorkBook);
        }
        releaseObject(xlApp);
    }

    // Remove the intermediate CSV folder once the workbook has been saved.
    var dir = new DirectoryInfo(pathString);
    dir.Attributes = dir.Attributes & ~FileAttributes.ReadOnly;
    dir.Delete(true);
}

最好的办法是使用第三方dll,示例代码如下:

namespace ConsoleApp2
{
    /// <summary>
    /// Console sample that converts a PDF file to an Excel workbook with SautinSoft PdfFocus.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Converts the PDF at a fixed path to an .xls file next to it and opens the workbook
        /// when <c>ToExcel</c> returns 0.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Sample input path; the output path is the same with an ".xls" extension.
            string pathToPdf = @"D:\abc\abc.pdf";
            string pathToExcel = Path.ChangeExtension(pathToPdf, ".xls");

            SautinSoft.PdfFocus f = new SautinSoft.PdfFocus();

            // NOTE(review): presumably limits the conversion to tabular data — confirm in the
            // SautinSoft documentation.
            f.ExcelOptions.ConvertNonTabularDataToSpreadsheet = false;
            // 'true'  = Preserve original page layout.
            // 'false' = Place tables before text.
            f.ExcelOptions.PreservePageLayout = true;
            // Start from the en-US culture, but use ',' as the decimal separator and '.' as the
            // group separator for numbers.
            System.Globalization.CultureInfo ci = new System.Globalization.CultureInfo("en-US");
            ci.NumberFormat.NumberDecimalSeparator = ",";
            ci.NumberFormat.NumberGroupSeparator = ".";
            f.ExcelOptions.CultureInfo = ci;
            f.OpenPdf(pathToPdf);
            if (f.PageCount > 0)
            {
                int result = f.ToExcel(pathToExcel);
                // Open the resulting Excel workbook (0 is treated as success here).
                if (result == 0)
                {
                    // UseShellExecute must be true to open a document with its associated
                    // application; it defaults to false on .NET Core / .NET 5+.
                    System.Diagnostics.Process.Start(
                        new System.Diagnostics.ProcessStartInfo(pathToExcel) { UseShellExecute = true });
                }
            }

        }
    }
}

最新更新