有效地查找字符串是否包含单词列表中的匹配项



我有一组如下所示的单词:

var words = ["champ", "winner", "world"];
var string = "I am a champ"; // should match
var string2 = "I am achamp"; // should match
var string3 = "I am a win ner"; // should match "winner" (or, as a last resort for efficiency, may be left unmatched)
var string4 = "I am not able to figure this"; // should not match

// Goal: match the sentences above against the word list efficiently, because
// the word array will be huge. What follows is the attempt so far.

var x = string.split(' ');
var x1 = string2.split(' ');
var x2 = string3.split(' ');
var x3 = string4.split(' ');
// Compares whole space-separated tokens against the list, so it only succeeds
// when a word stands alone as a token (as in `string`), not for the others.
if (words.some(v => x1.includes(v))) {
  console.log('Yes');
} else {
  console.log('No');
}

如何确保我能高效地匹配句子中的单词?句子中还可能包含诸如 "、?、/ 之类的特殊字符,我正考虑先把它们删除,以使检查更容易。由于单词列表可能至少有 1k 个,我该如何在客户端高效、快速地完成这项工作?

这种方法允许任何字符位于搜索字符串的字符之间

// Builds a single alternation in which every search word tolerates arbitrary
// characters between its letters, e.g. "champ" becomes "c.*?h.*?a.*?m.*?p".
const words = ["champ", "winner", "world"];
const strings = [
  "I am a champ",
  "I am achamp",
  "I am a win ner",
  "I am not able to figure this",
  "w_o_r_l_d"
];
const patterns = words.map((word) => word.split('').join('.*?'));
const regex = new RegExp(patterns.join('|'));
// Print one verdict per sentence.
strings.forEach((sentence) => {
  console.log(regex.test(sentence) ? 'Yes' : 'No');
});

输出:Yes Yes Yes No Yes

这种方法只允许在搜索字符串的字符之间有空格

// Matches the search words after stripping all whitespace from each sentence,
// so only whitespace may separate the letters of a word ("win ner" matches
// "winner", while "w_o_r_l_d" does not match "world").
const words = ["champ", "winner", "world"];
let strings = [
  "I am a champ",
  "I am achamp",
  "I am a win ner",
  "I am not able to figure this",
  "w_o_r_l_d"
];
const regex = new RegExp(words.join('|'));
// The class must be `\s` (any whitespace); a bare `s` would delete the letter
// "s" and leave the spaces in place.
strings = strings.map((sentence) => sentence.replace(/\s/g, ''));
for (const string of strings) {
  console.log(regex.test(string) ? 'Yes' : 'No');
}

输出:Yes Yes Yes No No

您可以为一个句子执行一个简单的正则表达式。至于排列,你应该决定一种策略,否则,你可能会面临无限多的选择
例如,字母之间的空格是否计数,如果计数,则两个没有空格的单词也计数,依此类推

不管怎样,下面的代码应该可以处理前两个句子:

/**
 * Reports whether the sentence contains at least one of the given words,
 * ignoring letter case. Each word is matched literally: regex metacharacters
 * in a word are escaped before the pattern is built.
 *
 * @param {String[]} words - words to look for
 * @param {string} sentence - text to search in
 * @returns {boolean} true if any word occurs in the sentence
 */
const match = (words, sentence) => {
  return words.some((word) => {
    // Escape metacharacters so a word such as "c++" neither throws nor is
    // interpreted as a pattern.
    const literal = word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // The "i" flag makes the test case-insensitive; drop it for a
    // case-sensitive match.
    return new RegExp(literal, 'i').test(sentence);
  });
};



// Demo: run the four sample sentences through `match`.
const words = ["champ", "winner", "world"];
const string = "I am a champ"; // should match
const string2 = "I am achamp"; // should match
const string3 = "I am a win ner"; // should match "winner" (or, as a last resort for efficiency, may be left unmatched)
const string4 = "I am not able to figure this"; // no match
// Prints true, true, false, false. string3 stays false unless the
// permutations of the original string are handled as well.
for (const sentence of [string, string2, string3, string4]) {
  console.log(match(words, sentence));
}

// True when at least one word occurs anywhere inside `string`.
words.some((v) => string.includes(v));

const words = ["champ", "winner", "world"];
const string = "I am a champ";
const string2 = "I am achamp";
const string3 = "I am a win ner";
const string4 = "I am not able to figure this";

// Goal: match the sentences above against the word list efficiently, because
// the word array will be huge.

// Space-separated tokens of each sentence (not used by the check below).
const [x, x1, x2, x3] = [string, string2, string3, string4].map((sentence) => sentence.split(' '));

// Substring test; only the first sentence (`string`) is checked here.
const found = words.some((v) => string.includes(v));
console.log(found ? 'Yes' : "no");

一种可能的方法是从单词列表中创建一个 Map,这样检查字符串中的某个单词是否也在单词列表中的复杂度为 O(1)。这将通过以下方式转换为代码:

// Build a Set of the words once so that each membership test is a direct
// lookup instead of a scan over the whole list.
const wordSet = new Set(words);
// NOTE(review): `string1` is not defined in this snippet; substitute the
// sentence to test. Only whole space-separated tokens are compared.
string1.split(' ').some((w) => wordSet.has(w));

您可以使用这个库。为了能够找到 "win ner",请在使用 Aho-Corasick 算法搜索之前删除所有空白。

// Spaces are removed before searching so that a word broken up by a space
// ("win ner") can still be found.
const ac = new AhoCorasick(["champ", "winner", "world"]);
const mystring = 'I am a win ner.';
const compacted = mystring.replace(/ /g,'');
const results = ac.search(compacted);
// Logs true when the search produced at least one result.
console.log(results.length > 0);
<script src="https://cdn.jsdelivr.net/gh/BrunoRB/ahocorasick/src/main.js"></script>

一个非常简单的解决方案:用每个单词对字符串执行 split,并检查得到的数组长度是否大于 1:

// Demo: print the `checkWords` verdict for each sample sentence.
const words = ["champ", "winner", "world"];
const string = "I am a champ";
const string2 = "I am achamp";
const string3 = "I am a winner";
const string4 = "I am not able to figure this";
[string, string2, string3, string4].forEach((sentence) => {
  console.log(checkWords(words, sentence));
});
/**
 * Tells whether any of the words occurs in the string. A word counts as
 * present when splitting the string on it yields more than one piece.
 *
 * @param {string[]} words - words to look for
 * @param {string} string - text to search in
 * @returns {boolean} true if at least one word splits the string
 */
function checkWords(words, string) {
  return words.some((word) => string.split(word).length > 1);
}

我在目前的任何答案(使用 regex 的那些)中都没有看到这种方法,我认为这是最简单的方法。

RegExp 的方法就是答案,这种方法类似于 @Shahar 的方法。不同的是,我把更多的工作交给了 RegExp 的 test 方法,而不是 some,这样会稍微快一点。

匹配第三句话会带来性能成本。

/**
 * Tests the sentence against the regex after removing its space characters.
 *
 * @param {RegExp} regex - pattern to test with
 * @param {string} sentence - text to search in
 * @returns {boolean} result of the regex test on the space-free sentence
 */
const match = (regex, sentence) => {
  // Dropping the spaces is what lets string3 ("win ner") match; skipping this
  // step performs better but string3 will no longer match.
  const compact = sentence.replace(/ /g, '');
  return regex.test(compact);
};

// Demo: a single case-insensitive alternation of all words, compiled once and
// reused for every sentence.
const words = ["champ", "winner", "world"];
const string = "I am a champ"; // matches
const string2 = "I am achamp"; // matches
const string3 = "I am a win ner"; // matches
const string4 = "I am not able to figure this"; // no match
const regex = new RegExp(words.join('|'), "i");
// Prints true, true, true, false.
[string, string2, string3, string4].forEach((sentence) => {
  console.log(match(regex, sentence));
});

基准测试

我在 JSBEN.CH 上测试了当前的 3 个 RegExp 答案:@Stan 的、@Shahar 的和我的,见此处。

自己去看看哪个最好 ;)

最新更新