提取 PDF 文件中 TextChunk 内每个单词各自的坐标

按照这个实际的解决方案，我试图获取TextChunk中的所有单词及其每个坐标(actual page，top，bottom，left，right)。

由于TextChunk可以是短语，单词或其他任何东西，因此我尝试手动执行此操作，计算最后一个单词的矩形并每次都将其剪切。我注意到这种手动方法可能非常有问题(我需要手动依靠特殊字符等)，所以我问自己 ITextSharp 是否提供了更简单的方法来执行此操作。

我的 Chunk 类和继承自 LocationTextExtractionStrategy 的类如下：

/// <summary>
/// A piece of extracted text together with the rectangle it occupies and
/// the render information it was produced from.
/// </summary>
public class Chunk
{
    /// <summary>Unique identifier generated when the chunk is created.</summary>
    public Guid Id { get; set; }

    /// <summary>Rectangle supplied by the caller for this chunk.</summary>
    public Rectangle Rect { get; set; }

    /// <summary>Render information the chunk was built from.</summary>
    public TextRenderInfo Render { get; set; }

    /// <summary>Font reported by <see cref="Render"/>.</summary>
    public BaseFont BF { get; set; }

    /// <summary>Text of the chunk.</summary>
    public string Text { get; set; }

    /// <summary>
    /// Font size estimated from the width of a single space, rounded to an
    /// integer; 0 when it cannot be estimated.
    /// </summary>
    public int FontSize { get; set; }

    /// <summary>
    /// Creates a chunk whose text is taken from <paramref name="renderInfo"/>.
    /// </summary>
    /// <param name="rect">Rectangle occupied by the chunk.</param>
    /// <param name="renderInfo">Render information of the chunk.</param>
    /// <exception cref="ArgumentNullException"><paramref name="renderInfo"/> is null.</exception>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo)
    {
        if (renderInfo == null)
        {
            throw new ArgumentNullException("renderInfo");
        }

        this.Rect = rect;
        this.Render = renderInfo;
        this.Text = renderInfo.GetText();
        Initialize();
    }

    /// <summary>
    /// Creates a chunk with an explicit text, e.g. a single word cut out of
    /// a larger chunk that shares the same render information.
    /// </summary>
    /// <param name="rect">Rectangle occupied by the chunk.</param>
    /// <param name="renderInfo">Render information of the chunk.</param>
    /// <param name="text">Text to store instead of the render info's own text.</param>
    /// <exception cref="ArgumentNullException"><paramref name="renderInfo"/> is null.</exception>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo, string text)
    {
        if (renderInfo == null)
        {
            throw new ArgumentNullException("renderInfo");
        }

        this.Rect = rect;
        this.Render = renderInfo;
        this.Text = text;
        Initialize();
    }

    // Fills in the members derived from Render.
    private void Initialize()
    {
        this.Id = Guid.NewGuid();
        this.BF = Render.GetFont();
        this.FontSize = ObtainFontSize();
    }

    // Estimates the font size by comparing the rendered width of a space
    // with the width the font reports for a space at 12 pt.
    private int ObtainFontSize()
    {
        float referenceWidth = this.BF.GetWidthPoint(" ", 12);

        // A zero reference width would make the division yield Infinity/NaN,
        // and Convert.ToInt32 throws OverflowException for those values.
        if (referenceWidth <= 0f)
        {
            return 0;
        }

        float estimatedSize = this.Render.GetSingleSpaceWidth() * 12 / referenceWidth;
        if (float.IsNaN(estimatedSize) || float.IsInfinity(estimatedSize))
        {
            return 0;
        }

        return Convert.ToInt32(estimatedSize);
    }
}
/// <summary>
/// Location-based extraction strategy that additionally records every
/// non-blank rendered chunk together with its bounding rectangle.
/// </summary>
public class LocationTextExtractionPersonalizada : LocationTextExtractionStrategy
{
    // Chunks collected so far, each with its coordinates.
    public List<Chunk> ChunksInPage = new List<Chunk>();

    /// <summary>
    /// Forwards the chunk to the base strategy and, unless its text is
    /// blank, stores it in <see cref="ChunksInPage"/> with its rectangle.
    /// </summary>
    /// <param name="renderInfo">Render information of the chunk; null is ignored.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        // The null check must come first: both the base call and GetText()
        // dereference renderInfo.
        if (renderInfo == null)
        {
            return;
        }

        base.RenderText(renderInfo);

        if (string.IsNullOrWhiteSpace(renderInfo.GetText()))
        {
            return;
        }

        // Start of the descent line and end of the ascent line are used as
        // the two opposite corners of the chunk rectangle.
        var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
        var topRight = renderInfo.GetAscentLine().GetEndPoint();

        var rect = new Rectangle(
            bottomLeft[Vector.I1],
            bottomLeft[Vector.I2],
            topRight[Vector.I1],
            topRight[Vector.I2]);

        ChunksInPage.Add(new Chunk(rect, renderInfo));
    }
}

因此，一旦我获得文件等，我就会以这种方式进行：

// Runs the custom extraction strategy over every page of pdfReader and
// splits the chunks collected for that page into words.
private void ProcessContent()
{
    int pageNumber = 1;
    while (pageNumber <= pdfReader.NumberOfPages)
    {
        // A fresh strategy per page, so ChunksInPage only holds that page's chunks.
        var pageStrategy = new LocationTextExtractionPersonalizada();

        // Extracting the page text is what feeds the chunks to the strategy.
        var pageText = PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, pageStrategy);

        // Each word with its coordinates.
        var pageWords = ChunkRawToWord(pageStrategy.ChunksInPage);

        pageNumber++;
    }
}
/// <summary>
/// Splits every chunk into words and gives each word its own rectangle,
/// laid out left to right inside the chunk's rectangle.
/// </summary>
/// <param name="chunks">Chunks collected by the extraction strategy.</param>
/// <returns>
/// The words of all chunks in order; an empty list when
/// <paramref name="chunks"/> is empty; null when it is null.
/// </returns>
/// <remarks>
/// Word widths are computed from the font metrics only.
/// NOTE(review): this assumes horizontal, unrotated text without extra
/// character or word spacing - confirm for the documents being processed.
/// Characters the pattern does not match are skipped and their width is
/// not accounted for.
/// </remarks>
private List<Chunk> ChunkRawToWord(IList<Chunk> chunks)
{
    if (chunks == null)
    {
        return null;
    }

    var words = new List<Chunk>();

    // A run of word characters (plus '@', '&', '+') followed by optional
    // trailing punctuation/whitespace. The metacharacters are escaped:
    // without the backslashes "(*" is an invalid quantifier and "w"/"s"
    // would only match those literal letters.
    const string pattern = @"[@&\w+]*(-*/*\s*:*;*,*\.*\(*\)*%*>*<*)?";

    foreach (var chunk in chunks)
    {
        if (chunk == null || string.IsNullOrEmpty(chunk.Text))
        {
            continue;
        }

        var matches = Regex.Matches(chunk.Text, pattern, RegexOptions.IgnoreCase);

        // Running x position inside the current chunk; it restarts at the
        // chunk's own left edge for every chunk.
        float cursor = chunk.Rect.Left;

        foreach (Match match in matches)
        {
            if (match.Length == 0)
            {
                continue;
            }

            float width = chunk.BF.GetWidthPoint(match.Value, chunk.FontSize);

            if (!string.IsNullOrWhiteSpace(match.Value))
            {
                // Every word gets its own copy of the rectangle so that
                // adjusting one word does not move the others.
                var wordRect = new Rectangle(chunk.Rect);
                wordRect.Left = cursor;
                wordRect.Right = cursor + width;
                words.Add(new Chunk(wordRect, chunk.Render, match.Value));
            }

            // Blank matches still take up horizontal space.
            cursor += width;
        }
    }

    return words;
}

之后，我在 mkl 的解决方案下发表了评论，得到的回复是"use getCharacterRenderInfos()"。我照做了，把每个字符都放进了一个 TextRenderInfo 列表中。

很抱歉，我开始把各种概念混在一起了；为了弄清楚如何应用该解决方案，我已经被搞得一头雾水。

如果有人能帮忙，我将不胜感激。

您可以使用方法TextRenderInfo.GetCharacterRenderInfos()获取块中每个字符的TextRenderInfo集合。然后，您可以将各个字符重新组合成单词，并使用单词中第一个和最后一个TextRenderInfo的坐标计算包含该单词的矩形。

在自定义文本提取策略中：

// Single-character strings that end a word. Declared with an explicit type
// because 'var' is only allowed on local variables, not on fields.
private readonly string[] _separators = { "-", "(", ")", "/", " ", ":", ";", ",", "." };

/// <summary>
/// Splits a rendered chunk into words character by character and passes
/// every word to <c>ProcessWord</c>.
/// </summary>
/// <param name="currentInfo">Render information of the chunk; null is ignored.</param>
protected virtual void ParseRenderInfo(TextRenderInfo currentInfo)
{
    if (currentInfo == null)
    {
        return;
    }

    var pendingChars = new List<TextRenderInfo>();

    foreach (var charRenderInfo in currentInfo.GetCharacterRenderInfos())
    {
        // The separator is added before the word is flushed, so it stays
        // attached to the end of the word it terminates.
        pendingChars.Add(charRenderInfo);

        if (_separators.Contains(charRenderInfo.GetText()))
        {
            ProcessWord(currentInfo, pendingChars);
            pendingChars.Clear();
        }
    }

    // Flush whatever follows the last separator.
    ProcessWord(currentInfo, pendingChars);
}
/// <summary>
/// Turns the per-character render infos of one word into a single text
/// chunk and appends it to <c>_chunks</c>. Does nothing for an empty list.
/// </summary>
/// <param name="charChunk">Render info of the whole chunk; supplies the single-space width.</param>
/// <param name="wordChunks">Render infos of the word's characters, in order.</param>
private void ProcessWord(TextRenderInfo charChunk, List<TextRenderInfo> wordChunks)
{
    var head = wordChunks.FirstOrDefault();
    var tail = wordChunks.LastOrDefault();

    if (head != null && tail != null)
    {
        // The word spans from the start of the first character's descent
        // line to the end of the last character's ascent line.
        var wordStart = head.GetDescentLine().GetStartPoint();
        var wordEnd = tail.GetAscentLine().GetEndPoint();

        var text = string.Join("", wordChunks.Select(info => info.GetText()));

        var location = new LocationTextExtractionStrategy.TextChunkLocationDefaultImp(
            wordStart,
            wordEnd,
            charChunk.GetSingleSpaceWidth());

        _chunks.Add(new CustomTextChunk(text, location));
    }
}

相关内容

最新更新

热门标签：