JavaScript 正则表达式:匹配带变音符号的单词边界



我需要在文本文档中匹配带有变音符号的单词。给定一个单词令牌 word,我的正则表达式是:

// Find the first occurrence of `word` in `text` that starts after the last
// occurrence already recorded for this token in `seen`.
// Inside a string literal "\b" is a backspace character, so the regex word
// boundary has to be written "\\b". Without the `u`/`v` flags `\b` is defined
// in terms of ASCII word characters only, so letters such as "ò" are treated
// as non-word characters.
var wordRegex = new RegExp("\\b(" + word + ")\\b", "g");
while ((match = wordRegex.exec(text)) !== null) {
  // `seen` stores, per token, the end offset of the last occurrence consumed.
  if (match.index > (seen.get(token) || -1)) {
    var wordStart = match.index;
    var wordEnd = wordStart + token.length - 1; // inclusive end offset
    item.characterOffsetBegin = wordStart;
    item.characterOffsetEnd = wordEnd;
    seen.set(token, wordEnd);
    break;
  }
}

这对 ciao、casa 等普通单词都可行,但当文本中出现 però、così 这类带变音符号的单词时,它就无法工作。

// Demo: tokenize `text` and try to locate each token's character offsets with
// a `\b`-delimited regex. `seen` maps a token to the inclusive end offset of
// its last consumed occurrence, so that repeated words ("nascoste") resolve to
// successive positions.
//
// Known limitation shown by this demo: without the `u`/`v` flags `\b` only
// considers ASCII [A-Za-z0-9_] as word characters, so a token ending in an
// accented letter (e.g. "però" followed by ",") never matches.
const seen = new Map();
var text = "Ci son macchine nascoste e, però, nascoste male";
var tokens = text.split(/[^a-zA-Z0-9àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ]+/i);
tokens.forEach((token, tokenIndex) => {
  const item = {
    "index": (tokenIndex + 1),
    "word": token
  };
  // Escape regex metacharacters so the token is matched literally.
  const escaped = token.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&");
  // "\\b" (not "\b", which is a backspace in a string literal).
  const wordRegex = new RegExp("\\b(" + escaped + ")\\b", "g");
  // Use has() rather than `|| -1`: a stored end offset of 0 is a valid value.
  const lastEnd = seen.has(token) ? seen.get(token) : -1;
  let match = null;
  console.log(token, "---->", wordRegex);
  while ((match = wordRegex.exec(text)) !== null) {
    console.log("\t---->", match.index);
    if (match.index > lastEnd) {
      const wordStart = match.index;
      const wordEnd = wordStart + token.length - 1; // inclusive end offset
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token, wordEnd);
      break;
    }
  }
});

可以看到,对某些单词(例如 macchine、nascoste)可以得到 match.index;而对另一些单词(例如 però),正则表达式无法正常工作,match 变量为 null:

macchine ----> /\b(macchine)\b/g
    ----> 7
nascoste ----> /\b(nascoste)\b/g
    ----> 16
e, ----> /\b(e,)\b/g
però, ----> /\b(però,)\b/g
nascoste ----> /\b(nascoste)\b/g
    ----> 16
    ----> 34

如何编写支持变音符号的单词边界正则表达式?

[更新] 按照评论中建议的方法,在应用正则表达式之前,我先去除了每个单词 token 中的变音符号,然后对整个 text 也做了同样的处理,例如:

// Strip diacritics from the whole text once, so that ASCII-only `\b` sees
// every letter as a word character.
var normalizedText = removeDiacritics(text);
// for each token...
// Escape regex metacharacters so the token is matched literally.
var escaped = token.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&");
// The token must be normalized the same way as the text it is searched in.
escaped = removeDiacritics(escaped);
// "\\b" (not "\b", which is a backspace in a string literal).
var wordRegex = new RegExp("\\b(" + escaped + ")\\b", "g");
var match = null;
while ((match = wordRegex.exec( normalizedText )) !== null) 
{
                             //...

这一次,带重音的单词也能被 \b 单词边界捕获了。当然,这种方法并非最优,因为必须对每个令牌都调用 removeDiacritics;更好的方案是只对文本执行一次。

下面是我们在评论中提出的解决方案,用于将各令牌映射到其在文本中的索引:

/**
 * Returns `text` with diacritics removed by delegating to lodash's `_.deburr`.
 * Relies on the global `_` being loaded before this is called.
 *
 * @param {string} text - Input string.
 * @returns {string} The result of `_.deburr(text)`.
 */
function removeDiacritics(text) {
  const deburred = _.deburr(text);
  return deburred;
}
// Demo: locate each token's character offsets by searching a
// diacritics-stripped copy of the text, so that ASCII-only `\b` treats
// formerly-accented letters as word characters. `seen` maps a token to the
// inclusive end offset of its last consumed occurrence, so repeated words
// resolve to successive positions.
const seen = new Map();
var text = "Ci son macchine nascoste e, però, nascoste male";
var tokens = text.split(/[^a-zA-Z0-9àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ]+/i);
// Normalize the full text once, outside the per-token loop.
var normalizedText = removeDiacritics(text);

tokens.forEach((token, tokenIndex) => {
  const item = {
    "index": (tokenIndex + 1),
    "word": removeDiacritics(token)
  };
  // Escape regex metacharacters so the token is matched literally, then
  // normalize it the same way as the text it is searched in.
  let escaped = token.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&");
  escaped = removeDiacritics(escaped);
  // "\\b" (not "\b", which is a backspace in a string literal).
  const wordRegex = new RegExp("\\b(" + escaped + ")\\b", "g");
  // Use has() rather than `|| -1`: a stored end offset of 0 is a valid value.
  const lastEnd = seen.has(token) ? seen.get(token) : -1;
  let match = null;
  console.log(token, "---->", wordRegex);
  while ((match = wordRegex.exec(normalizedText)) !== null) {
    console.log("\t---->", match.index);
    if (match.index > lastEnd) {
      // NOTE(review): offsets are positions in normalizedText; they equal
      // positions in `text` only if removeDiacritics preserves string length
      // for the input — confirm for the characters you expect.
      const wordStart = match.index;
      const wordEnd = wordStart + token.length - 1; // inclusive end offset
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token, wordEnd);
      break;
    }
  }
});
<script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.11/lodash.min.js"></script>

最新更新