如何使用JavaScript计算不同术语在字符串中出现的次数



我有一个20个短语的列表,我想在一篇文章中计算这些短语。

目前,我正在做


// For every phrase, count its case-insensitive occurrences in `articleBody`
// and store the number on `phrase.usage`. `counts` holds the same (mutated)
// phrase objects, in their original order.
let counts = phrases.map((phrase) => {
  // Escape regex metacharacters so the phrase is searched for literally;
  // without this a phrase such as "C++" throws and "a.b" also matches "axb".
  const literal = phrase.phrase.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  phrase.usage = (articleBody.match(new RegExp(literal, 'gi')) || []).length;
  return phrase;
});

这要求我每次都要进行昂贵的搜索。有没有什么方法可以更快或一次完成?

您可以使用String.prototype.matchAll

// Demo: join several text fragments with a single space and count the
// case-insensitive hits of one term across the combined text.
const str = ['ocurrence 1, and..', 'ocurrence 2, and..', 'other phrases'];
const joined = str.join(' ');
const array = Array.from(joined.matchAll(/ocurrence/gi));
console.log(array.length); // 2

编辑:在一个字符串中计数出现次数;

// Demo: count the case-insensitive hits of one term inside a single string
// by collecting every match object yielded by matchAll.
const articleBody = 'ocurrence 1, and.. ocurrence 2, and... etc';
const pattern = /ocurrence/gi;
const array = [];
for (const hit of articleBody.matchAll(pattern)) {
  array.push(hit);
}
console.log(array.length); // 2

编辑2:

使用 Web Worker

我创建了一段3000字的文本(用"Lorem ipsum"生成器生成)。

main.js

// Counts each phrase in its own dedicated Web Worker so the searches run off
// the main thread. Once a worker answers, its entry in `phrases` is replaced
// by `{ text, match }`, where `match` is the number of occurrences reported.
//
// The feature test checks for `Worker` because that is the API used below
// (`new Worker(...)`); the availability of service workers is unrelated.
if (typeof window !== "undefined" && "Worker" in window) {
  document.addEventListener("DOMContentLoaded", function () {
    const phrases = [
      { text: "Lorem ipsum" },
      { text: "elementum mattis" },
    ];
    const article =
      "Lorem ipsum dolor sit amet consectetur adipiscing elit sapien orci, ligula leo consequat mus tellus elementum mattis lacus maecenas curabitur, ";

    // forEach, not map: the callback only has side effects.
    phrases.forEach((phrase, index) => {
      const phraseWorker = new Worker("phrase-match-sw.js");
      phraseWorker.onmessage = function (oEvent) {
        const { text, match } = oEvent.data;
        phrases[index] = { text, match };
        // The worker script posts exactly one reply per request, so the
        // thread can be released as soon as that reply has arrived.
        phraseWorker.terminate();
      };
      phraseWorker.postMessage({ article, phrase });
    });
  });
}

phrase-match-sw.js

/**
 * Worker message handler: counts the case-insensitive occurrences of one
 * phrase in an article and posts the phrase back with the count attached.
 *
 * Expects `event.data` to be `{ article: string, phrase: { text: string } }`
 * and replies with the phrase's own fields plus `match` (the hit count).
 *
 * Assigned through `globalThis` (the worker's global scope) so the
 * assignment is explicit and also valid in strict-mode contexts.
 */
globalThis.onmessage = function (event) {
  const { article, phrase } = event.data;
  // Escape regex metacharacters so the phrase is matched literally; an
  // unescaped "a.b" would also count "axb", and "C++" would throw.
  const literal = phrase.text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const match = [...article.matchAll(new RegExp(literal, "gi"))].length;
  postMessage({ ...phrase, match });
};

请看一下这个解决方案。

/**
 * Counts how many times `subString` appears in `string` (case-sensitive).
 *
 * Both arguments are coerced to strings first. An empty `subString` yields
 * `string.length + 1`.
 *
 * @param {*} string - Text to search in.
 * @param {*} subString - Text to search for.
 * @param {boolean} [allowOverlapping] - When truthy, the search resumes one
 *   character after each hit, so overlapping hits are counted; otherwise it
 *   resumes after the end of the hit.
 * @returns {number} Number of occurrences found.
 */
function occurrences(string, subString, allowOverlapping) {
  const haystack = string + "";
  const needle = subString + "";

  if (needle.length <= 0) {
    return haystack.length + 1;
  }

  const stride = allowOverlapping ? 1 : needle.length;
  let total = 0;
  for (
    let at = haystack.indexOf(needle, 0);
    at >= 0;
    at = haystack.indexOf(needle, at + stride)
  ) {
    total += 1;
  }
  return total;
}
// Micro-benchmark: time the indexOf-based occurrences() helper against the
// RegExp/match approach on the same three searches.
const haystack = "foofoofoo";
const needles = ["bar", "foo", "foofoo"]; // expected counts: 0, 3, 1

console.time('occurrences');
for (const needle of needles) {
  occurrences(haystack, needle);
}
console.timeEnd('occurrences');

console.time('RegExp');
for (const needle of needles) {
  (haystack.match(new RegExp(needle, 'gi')) || []).length;
}
console.timeEnd('RegExp');

这是基准结果:https://jsben.ch/aIqEP

我在一篇50KB的文章中试用了它,并根据您的使用情况进行了基准测试。结果显示,下面的解决方案比您的解决方案快 40%。

基准链接https://jsben.ch/wWW6e

我减小了文章的大小以保存它,你可以增加它以获得更好的结果。

let phrases = [{ phrase: 'placerat', usage: 0 }, { phrase: 'lorem', usage: 0 }];

// One alternation of all phrases, so the article is scanned a single time.
// Each phrase is escaped so regex metacharacters in it are matched literally.
const phrasePattern = new RegExp(
  phrases
    .map(({ phrase }) => phrase.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
    .join('|'),
  'gi'
);

// Tally of hits keyed by the lower-cased matched text: { [text]: { count } }.
// match() returns null when nothing matches, hence the `|| []` fallback.
let matching = (article.match(phrasePattern) || []).reduce((acc, match) => {
  const key = match.toLowerCase();
  if (acc[key]) {
    acc[key].count++;
  } else {
    // The first hit counts too, so the tally starts at 1 rather than 0.
    acc[key] = { count: 1 };
  }
  return acc;
}, {});

您可以在.match()调用中使用|运算符来匹配多个短语。developer.mozilla.org上有一个匹配fox或cat的例子,但他们没有详细说明计算结果。

但由于match()返回结果,只需构建一个哈希来计算每次匹配的次数。。。

// Counts whole-word, case-insensitive occurrences of several phrases in one
// pass over the text and logs a { phrase: count } tally.

// set data we'll be using
const str = "hello 1 hello 2 hello222notarealwordmatch hey 3";       // text string to search
const matchwords = ['hello', 'hey'];    // look for the phrases "hello" or "hey"

// build regex
// The backslash must be doubled: inside a string literal '\b' is a backspace
// character, whereas the regex needs the two characters "\b" (word boundary).
const fullmatch = '\\b' + matchwords.join('\\b|\\b') + '\\b';
const match = new RegExp(fullmatch, "gi");     // "g" = global, find every match in the string; "i" = ignore case

// build match results
const matches = {};
const results = str.match(match);   // null when nothing matched
if (results) {
  results.forEach((thing) => {
    // The search ignores case, so fold the key as well; otherwise "Hello"
    // and "hello" would be tallied as two different phrases.
    const key = thing.toLowerCase();
    matches[key] = matches[key] ? matches[key] + 1 : 1;
  });
}

// display results
console.log(matches);

在您的情况下,为了完整匹配整个单词,您还需要在单词边界上进行匹配,例如 /\bcode\b|\bdatabase\b/ 会匹配 "code database" 中的两个单词,但不会匹配 "databaselikecodeesque"。在我上面的例子中,您会看到我们搜索的是 "hello",而 "hello222notarealwordmatch" 被正确地排除在计数之外。如果您不在乎单词边界,只需在定义 fullmatch 时去掉上面 \b 的4个实例:

// Variant without word boundaries: the phrases are joined with "|" only, so a
// phrase is also matched when it appears inside a longer word.
var fullmatch = matchwords.join('|');

最新更新