我需要在文本文档中匹配带有变音符号(重音)的单词。给定一个单词 token(word),
我的正则表达式是:
// Build a whole-word pattern for the token. The backslash has to be doubled in
// a string literal: "\b" on its own is a backspace character, not the regex
// word-boundary assertion.
// NOTE: \b is defined in terms of \w (ASCII letters, digits and underscore),
// so no boundary is found after a word that ends in an accented letter.
var wordRegex = new RegExp("\\b(" + word + ")\\b", "g");
while ((match = wordRegex.exec(text)) !== null) {
  // End offset of the previous occurrence recorded for this token, or -1 when
  // there is none. An explicit has() check keeps a stored offset of 0 intact
  // (`0 || -1` would discard it).
  var lastEnd = seen.has(token) ? seen.get(token) : -1;
  if (match.index > lastEnd) {
    var wordStart = match.index;
    var wordEnd = wordStart + token.length - 1; // inclusive end offset
    item.characterOffsetBegin = wordStart;
    item.characterOffsetEnd = wordEnd;
    seen.set(token, wordEnd);
    break; // only the first not-yet-recorded occurrence is wanted
  }
}
这对 ciao、casa 等普通单词都有效,但当文本中出现 però、così 等带重音的单词时,它就无法工作。
// Demo: locate the character offsets of every token inside `text`.
// `seen` maps a token to the inclusive end offset of its last located
// occurrence, so a repeated word (e.g. "nascoste") resolves to its next
// occurrence instead of the first one again.
const seen = new Map();
var text = "Ci son macchine nascoste e, però, nascoste male"
var tokens = text.split(/[^a-zA-Z0-9àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ]+/i)
tokens.forEach((token, tokenIndex) => {
  const item = {
    "index": (tokenIndex + 1),
    "word": token
  };
  // Escape regex metacharacters so the token is matched literally.
  // "\\$&" inserts a backslash in front of each matched character.
  const escaped = token.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&");
  // "\\b" in the string literal yields the regex assertion \b.
  // NOTE: \b is defined in terms of \w (ASCII letters, digits and underscore),
  // so a token ending in an accented letter such as "però" finds no trailing
  // boundary and is not matched by this pattern.
  const wordRegex = new RegExp("\\b(" + escaped + ")\\b", "g");
  // End offset of the previous occurrence of this token, or -1 when there is
  // none. An explicit has() check keeps a stored offset of 0 intact.
  const lastEnd = seen.has(token) ? seen.get(token) : -1;
  let match = null;
  console.log(token, "---->", wordRegex);
  while ((match = wordRegex.exec(text)) !== null) {
    console.log("\t---->", match.index);
    if (match.index > lastEnd) {
      const wordStart = match.index;
      const wordEnd = wordStart + token.length - 1; // inclusive end offset
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token, wordEnd);
      break; // only the first not-yet-recorded occurrence is wanted
    }
  }
});
您可以看到,匹配某些单词(例如 macchine 或 nascoste)时可以得到 match.index;
而对于另一些单词(例如 però),正则无法正常工作,match 变量为 null:
macchine ----> /\b(macchine)\b/g
----> 7
nascoste ----> /\b(nascoste)\b/g
----> 16
e, ----> /\b(e,)\b/g
però, ----> /\b(però,)\b/g
nascoste ----> /\b(nascoste)\b/g
----> 16
----> 34
如何编写一个支持变音符号的单词边界正则表达式?
[update] 按照评论中建议的方法,在应用正则之前,我先去掉了每个 token 的变音符号,
并对整个 text 也做了同样的处理,例如:
// Strip diacritics from the whole text once, up front.
var normalizedText = removeDiacritics(text);
// for each token...
// Escape regex metacharacters so the token is matched literally ("\\$&" puts a
// backslash in front of each matched character), then strip its diacritics so
// it can be searched for in the normalized text.
var escaped = token.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&");
escaped = removeDiacritics(escaped);
// "\\b" in the string literal yields the regex word-boundary assertion \b;
// a single-backslash "\b" would be a backspace character.
var wordRegex = new RegExp("\\b(" + escaped + ")\\b", "g");
var match = null;
while ((match = wordRegex.exec( normalizedText )) !== null)
{
//...
这次,带重音的单词也能被 \b 单词边界匹配到了。当然,这种方法并不是最优的,
因为必须对每个 token 调用 removeDiacritics,所以最好的方案是只调用一次。
这是我们在评论中提出的解决方案,用于把匹配结果映射回文本中的索引:
/**
 * Return `text` after passing it through lodash's `_.deburr`.
 *
 * Relies on the global `_` provided by the lodash script.
 *
 * @param {string} text - String to process.
 * @returns {string} The result of `_.deburr(text)`.
 */
function removeDiacritics(text) {
  const deburred = _.deburr(text);
  return deburred;
}
// Locate the character offsets of every token by searching a
// diacritics-free copy of the text.
// `seen` maps a token to the inclusive end offset of its last located
// occurrence, so a repeated word resolves to its next occurrence.
const seen = new Map();
var text = "Ci son macchine nascoste e, però, nascoste male"
var tokens = text.split(/[^a-zA-Z0-9àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ]+/i)
// Normalize the whole text once; every token is searched in this copy.
// NOTE(review): the offsets found in normalizedText only line up with `text`
// if removeDiacritics preserves string length — confirm for the inputs used.
var normalizedText = removeDiacritics(text)
tokens.forEach((token, tokenIndex) => {
  const item = {
    "index": (tokenIndex + 1),
    "word": removeDiacritics(token)
  };
  // Escape regex metacharacters so the token is matched literally.
  // "\\$&" inserts a backslash in front of each matched character.
  let escaped = token.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&");
  escaped = removeDiacritics(escaped);
  // "\\b" in the string literal yields the regex word-boundary assertion \b;
  // a single-backslash "\b" would be a backspace character.
  const wordRegex = new RegExp("\\b(" + escaped + ")\\b", "g");
  // End offset of the previous occurrence of this token, or -1 when there is
  // none. An explicit has() check keeps a stored offset of 0 intact.
  const lastEnd = seen.has(token) ? seen.get(token) : -1;
  let match = null;
  console.log(token, "---->", wordRegex);
  while ((match = wordRegex.exec(normalizedText)) !== null) {
    console.log("\t---->", match.index);
    if (match.index > lastEnd) {
      const wordStart = match.index;
      const wordEnd = wordStart + token.length - 1; // inclusive end offset
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token, wordEnd);
      break; // only the first not-yet-recorded occurrence is wanted
    }
  }
});
<script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.11/lodash.min.js"></script>