如何将一大段文本缩减为与搜索相关的信息。
例如,假设我有一个段落,我的搜索是efficitur eget
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec placerat libero id mi facilisis, at sagittis tortor porta. Donec eget sodales ipsum. Donec sagittis lacus mauris, et efficitur quam porttitor eu. Fusce eget consequat purus. Maecenas rutrum arcu viverra est rhoncus, et hendrerit tellus elementum. Aenean ornare dolor tempus ante porta, sit amet convallis lacus rutrum. Maecenas bibendum magna tortor. Vestibulum tortor nunc, dictum vitae nisl quis, pharetra mattis massa. Vestibulum vulputate leo eros, eget maximus ipsum tristique quis. Quisque rutrum vel felis eget feugiat. Etiam interdum nisi ac nibh egestas malesuada. Mauris fringilla nisi id rutrum fermentum. Ut ultrices ipsum rutrum, hendrerit urna non, dapibus ligula. Vivamus rhoncus eros eget eros feugiat volutpat. In ac arcu at purus porta varius. Sed commodo diam a ipsum vestibulum, et sagittis sem consectetur.
是否可以很容易地将文本缩减为只包含 efficitur 和 eget 的句子,而不显示整个段落?
... Donec sagittis lacus mauris, et efficitur quam porttitor eu. Fusce eget consequat purus. Maecenas rutrum ...
目前我的伪代码(pseudo-code)思路是:
// Find strpos of search words
// Make positions unique
// Find words closest together within X characters
// Allow for words on LEFT and RIGHT of keyword
// .. Continue until every keyword has lapsed
// Add "dots" to LEFT or/and RIGHT of the result
// implode
// return
但如果已经有人实现过,或者 PHP 本身就提供了这样的函数,我宁愿不重新发明轮子。
我已经编写了自己的函数,可以将大段落转换为小句子
/**
 * Reduce a long text to short excerpts around the first occurrence of each keyword.
 *
 * The content is trimmed and split on single spaces. For every keyword the first
 * matching word is located (case-insensitively via strtolower()), and a window of
 * up to $max_words words before and $max_words - 1 words after it is kept.
 * Windows that overlap or touch are merged, the resulting excerpts are emitted in
 * text order, and $dots marks every place where words were left out.
 *
 * @param string          $content   Text to shorten.
 * @param string|string[] $keywords  A keyword or a list of keywords; blank entries are ignored.
 * @param bool            $exact     True: a word must equal the keyword.
 *                                   False: the word only has to contain the keyword.
 * @param int             $max_words Window size in words; values below 1 disable shortening.
 * @param string          $dots      Ellipsis marker, e.g. '...'.
 *
 * @return string '' for blank content; the trimmed content when it is not longer than
 *                $max_words words, when no usable keyword is given, or when no keyword
 *                is found; otherwise the excerpt string.
 */
function reduce_max_word_contents($content, $keywords, $exact, $max_words, $dots)
{
    $content = trim((string) $content);
    if ($content === '') {
        return '';
    }

    // Normalise the keywords once: lower-case, trimmed, no blanks.
    $needles = array();
    foreach ((array) $keywords as $keyword) {
        if (!is_scalar($keyword)) {
            continue;
        }
        $needle = strtolower(trim((string) $keyword));
        if ($needle !== '') {
            $needles[] = $needle;
        }
    }

    $max_words = (int) $max_words;
    if (empty($needles) || $max_words < 1) {
        return $content;
    }

    $words = explode(' ', $content);
    $last_index = count($words) - 1;
    if (count($words) <= $max_words) {
        // Nothing to shorten.
        return $content;
    }

    $lower_words = array();
    foreach ($words as $index => $word) {
        $lower_words[$index] = strtolower(trim($word));
    }

    // Word index of the first hit of every keyword (used as array keys to de-duplicate).
    $positions = array();
    foreach ($needles as $needle) {
        foreach ($lower_words as $index => $word) {
            // strpos() !== false instead of strstr() truthiness: a remainder of "0" is still a match.
            $is_match = $exact ? ($word === $needle) : (strpos($word, $needle) !== false);
            if ($is_match) {
                $positions[$index] = true;
                break;
            }
        }
    }
    if (empty($positions)) {
        return $content;
    }
    // Process hits in text order, regardless of the order of the keywords.
    ksort($positions);

    // Build [start, end] word ranges and merge the ones that overlap or are adjacent,
    // so no word is ever emitted twice.
    $ranges = array();
    foreach (array_keys($positions) as $position) {
        $start = max(0, $position - $max_words);
        $end = min($last_index, $position + $max_words - 1);
        $previous = count($ranges) - 1;
        if ($previous >= 0 && $start <= $ranges[$previous][1] + 1) {
            $ranges[$previous][1] = max($ranges[$previous][1], $end);
        } else {
            $ranges[] = array($start, $end);
        }
    }

    $excerpts = array();
    foreach ($ranges as $range) {
        $excerpts[] = implode(' ', array_slice($words, $range[0], $range[1] - $range[0] + 1));
    }

    // Between excerpts the marker is trimmed and space-separated; an empty marker
    // degrades to a plain space.
    $dots = (string) $dots;
    $inner_dots = trim($dots);
    $separator = $inner_dots === '' ? ' ' : ' ' . $inner_dots . ' ';
    $result = implode($separator, $excerpts);

    // Only mark the edges when text was really cut off there.
    if ($ranges[0][0] > 0) {
        $result = $dots . $result;
    }
    if ($ranges[count($ranges) - 1][1] < $last_index) {
        $result = $result . $dots;
    }

    return $result;
}
这是一个片段,它可以处理一个或多个搜索词(needle),以不区分大小写、按单词边界匹配的方式搜索目标字符串(haystack),找到包含最多不同匹配项的最短子串,然后截取该子串的首个单词和末个单词所在的完整句子。
代码:(演示)
/**
 * Find the shortest stretch of $text that contains every needle that occurs in it,
 * then widen that stretch to whole sentences (delimited by ".").
 *
 * Needles are matched case-insensitively on word boundaries. Two matches that differ
 * only in letter case count as the same needle.
 *
 * @param string   $text    Haystack to search.
 * @param string[] $needles Words to look for; empty entries are ignored.
 *
 * @return string|null The sentence(s) around the narrowest window, prefixed with "..."
 *                     and/or suffixed with " ..." where text was cut off, or null when
 *                     no needle occurs in the text.
 */
function find_needle_excerpt(string $text, array $needles): ?string
{
    $words = [];
    foreach ($needles as $needle) {
        $needle = (string) $needle;
        if ($needle !== '') {
            $words[] = $needle;
        }
    }
    if (!$words) {
        return null;
    }

    // Longer needles first, so the alternation prefers them over shorter ones.
    usort($words, fn($a, $b) => strlen($b) <=> strlen($a));

    // Quote with the pattern delimiter so a needle containing "#" cannot break the pattern.
    $quoted = array_map(fn($word) => preg_quote($word, '#'), $words);
    $pattern = '#\b(?>' . implode('|', $quoted) . ')\b#i';

    if (!preg_match_all($pattern, $text, $m, PREG_OFFSET_CAPTURE)) {
        return null;
    }

    $matches = $m[0];
    $matchCount = count($matches);
    $fullSetCount = null;
    $bestStart = null;
    $bestLength = null;

    // For every match, extend to the right until each distinct needle has been seen once.
    foreach ($matches as $i => [$word, $offset]) {
        $seen = [strtolower($word) => true];
        // Start from the current match so a window of one needle is well defined.
        $lastWord = $word;
        $lastOffset = $offset;
        for ($j = $i + 1; $j < $matchCount; $j++) {
            [$nextWord, $nextOffset] = $matches[$j];
            $key = strtolower($nextWord);
            if (!isset($seen[$key])) {
                $seen[$key] = true;
                $lastWord = $nextWord;
                $lastOffset = $nextOffset;
            }
        }

        // The window starting at the first match spans every distinct needle found.
        $fullSetCount ??= count($seen);
        if (count($seen) !== $fullSetCount) {
            continue;
        }

        $length = $lastOffset + strlen($lastWord) - $offset;
        if ($bestLength === null || $length < $bestLength) {
            $bestStart = $offset;
            $bestLength = $length;
        }
    }

    // Widen the window to sentence boundaries using offsets.
    $textLength = strlen($text);
    $previousDot = strrpos(substr($text, 0, $bestStart), '.');
    $sentenceStart = $previousDot === false ? 0 : $previousDot + 1;
    $nextDot = strpos($text, '.', $bestStart + $bestLength);
    $sentenceEnd = $nextDot === false ? $textLength : $nextDot + 1;

    return ($sentenceStart > 0 ? '...' : '')
        . substr($text, $sentenceStart, $sentenceEnd - $sentenceStart)
        . ($sentenceEnd < $textLength ? ' ...' : '');
}

$text = <<<TEXT
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec placerat libero id mi facilisis, at sagittis tortor porta. Donec eget sodales ipsum. Donec sagittis lacus mauris, et efficitur quam porttitor eu. Fusce eget consequat purus. Maecenas rutrum arcu viverra est rhoncus, et hendrerit tellus elementum. Aenean ornare dolor tempus ante porta, sit amet convallis lacus rutrum. Maecenas bibendum magna tortor. Vestibulum tortor nunc, dictum vitae nisl quis, pharetra mattis massa. Vestibulum vulputate leo eros, eget maximus ipsum tristique quis. Quisque rutrum vel felis eget feugiat. Etiam interdum nisi ac nibh egestas malesuada. Mauris fringilla nisi id rutrum fermentum. Ut ultrices ipsum rutrum, hendrerit urna non, dapibus ligula. Vivamus rhoncus eros eget eros feugiat volutpat. In ac arcu at purus porta varius. Sed commodo diam a ipsum vestibulum, et sagittis sem consectetur.
TEXT;
$needles = [
    'efficitur',
    'eget',
    'Ipsum',
];

// Isolate the desired output string (with optional ellipses) and print it
$excerpt = find_needle_excerpt($text, $needles);
if ($excerpt !== null) {
    var_export($excerpt);
} else {
    echo 'No needles found in haystack';
}