如何对具有相同前缀/后缀的列表进行分类?



我有一个单词列表如下:

# Sample input from the question: prefixed, suffixed and unrelated words.
Data = [
    'pre_bbc',
    'pre_nbc',
    'pre_fox',
    'bread_post',
    'pre_news',
    'lucky_post',
    'banana_post',
    'mike',
    'john',
    'edward_lear',
    'winelistpdf',
    'cookbookspdf',
]

假设我事先不知道前缀或后缀是什么,并且"_"并不总是用来分隔前缀/后缀,有没有办法使用 Python 将此列表分成若干组?假设我想要的结果如下:

# Desired grouping: one list per shared affix, plus the words sharing none.
List0 = [
    'pre_bbc',
    'pre_nbc',
    'pre_fox',
    'pre_news',
]
List1 = [
    'bread_post',
    'lucky_post',
    'banana_post',
]
List2 = [
    'winelistpdf',
    'cookbookspdf',
]
Orphan_list = [
    'mike',
    'john',
    'edward_lear',
]

可能有一些棘手的情况,其中单词同时包含后缀和前缀,例如"pre_voa_post",我认为这可以放入两个列表中。此外,假设所有元素在此列表中都是唯一的。

谢谢!

这是一个非常具有挑战性的问题!如果需要一个相当通用的解法,有以下几个条件需要考虑。

  • 词缀的最小长度
  • 表示词缀的分隔符
  • 多个词缀
import json
def get_affix_groups(words, min=3, delimiter="_"):
    """Group words that share a prefix or a suffix.

    Candidate affixes are derived per word (see ``_affix_candidates``); a
    candidate becomes a group as soon as at least one *other* word starts
    with it (prefix) or ends with it (suffix).

    Args:
        words: Collection of words. It is iterated several times, so it must
            be re-iterable (a list or tuple, not a generator).
        min: Shortest affix tried for words without ``delimiter``. Words of
            ``min`` characters or fewer never propose an affix themselves,
            but can still be added to a group proposed by another word.
            (The name shadows the builtin; it is kept so existing keyword
            callers keep working.)
        delimiter: Separator marking affixes. For a word containing it, only
            the first and last delimiter-separated parts are candidates.

    Returns:
        A dict mapping each shared affix to the set of words carrying it,
        plus an ``"orphans"`` key holding the set of words found in no
        group. The ``"orphans"`` entry is written last, so it replaces any
        affix group that happens to have that key.
    """
    min_len = min
    groups = {}
    for word in words:
        # Candidates depend on the word only, so compute them once per word
        # rather than once per (item, length) combination.
        candidates = _affix_candidates(word, min_len, delimiter)
        if not candidates:
            continue
        for item in words:
            if item == word:
                continue
            for prefix, suffix in candidates:
                if item.startswith(prefix):
                    groups.setdefault(prefix, {word}).add(item)
                if item.endswith(suffix):
                    groups.setdefault(suffix, {word}).add(item)
    grouped = set().union(*groups.values())
    groups["orphans"] = {word for word in words if word not in grouped}
    return groups


def _affix_candidates(word, min_len, delimiter):
    """Return the (prefix, suffix) pairs that *word* proposes as affixes."""
    if len(word) <= min_len:
        return []
    try:
        prefix, *_, suffix = word.split(delimiter)
    except ValueError:
        # Either the word has no delimiter (a single part cannot be unpacked)
        # or the delimiter is empty (str.split rejects it): fall back to
        # every slice length from min_len up to one short of the whole word.
        return [
            (word[:size], word[-size:]) for size in range(min_len, len(word))
        ]
    return [(prefix, suffix)]
# Sample input: the words from the question plus one word carrying both a
# prefix and a suffix.
data = [
    "pre_bbc", "pre_nbc", "pre_fox", "bread_post", "pre_news",
    "lucky_post", "banana_post", "mike", "john", "edward_lear",
    "winelistpdf", "cookbookspdf", "pre_voa_post",
]
# Pretty-print the grouping; default=list lets json encode the set values.
print(
    json.dumps(
        get_affix_groups(data),
        default=list,
        indent=2,
    )
)

输出

{
"pre": [
"pre_fox",
"pre_voa_post",
"pre_bbc",
"pre_news",
"pre_nbc"
],
"post": [
"lucky_post",
"pre_voa_post",
"bread_post",
"banana_post"
],
"pdf": [
"cookbookspdf",
"winelistpdf"
],
"orphans": [
"john",
"edward_lear",
"mike"
]
}

如果你真的需要这些变量,你可以使用exec(),但这被认为是不好的做法。

# WARNING: exec() compiles and runs source text built from the group keys and
# the repr of each group. A key that is not a valid identifier raises, and a
# key matching an existing name silently rebinds it. Prefer using the dict
# returned by get_affix_groups() directly; never do this with untrusted words.
for affix, group in get_affix_groups(data).items():
    exec(f"{affix} = {group}")

测试:

# Test input mixing delimiter-marked affixes with plain shared stems.
Data = [
    'pre_voa_post',
    'argument',
    'thermodynamic',
    'winelistpdf',
    'pre_bbc',
    'anteroom',
    'pre_nbc',
    'thermostat',
    'pre_fox',
    'antedate',
    'blabla',
    'enchantment',
    'pre_news',
    'lucky_post',
    'banana_post',
    'mike',
    'john',
    'thermometer',
    'toto',
    'antenatal',
]

函数

def test(Data):
    """Print the prefix groups, suffix groups and orphans found in ``Data``.

    Two independent greedy passes run over a copy of ``Data``: one groups
    words by their longest shared prefix, the other by their longest shared
    suffix. A word is reported as an orphan only when neither pass placed it
    in a group.

    Args:
        Data: Iterable of words; it is copied, never modified.

    Returns:
        None. The three results are written to stdout.
    """
    # Minimum shared lengths: 2 characters for a prefix, 3 for a suffix.
    my_prefixes, Orphan_p = _group_by_affix(Data, min_len=2, from_end=False)
    my_suffixes, Orphan_s = _group_by_affix(Data, min_len=3, from_end=True)
    Orphan_list = list(set(Orphan_p) & set(Orphan_s))
    print("my_suffixes", my_suffixes)
    print("my_prefixes", my_prefixes)
    print("Orphan_list", Orphan_list)


def _group_by_affix(words, min_len, from_end):
    """Greedily group *words* by shared prefix, or suffix when *from_end*.

    Returns ``(groups, orphans)``: ``groups`` maps each shared affix to the
    list of words carrying it, and ``orphans`` lists the words for which no
    partner sharing at least ``min_len`` characters was found.
    """

    def affix_of(word, size):
        return word[-size:] if from_end else word[:size]

    remaining = list(words)
    groups = {}
    orphans = []
    while len(remaining) > 1:
        first = remaining.pop(0)
        grouped = False
        for other in remaining:
            # Longest affix shared by the pair, down to min_len characters.
            size = min(len(first), len(other))
            while size >= min_len and affix_of(first, size) != affix_of(other, size):
                size -= 1
            if size < min_len:
                continue
            shared = affix_of(first, size)
            # Every remaining word with the affix joins the group and leaves
            # the pool, so it cannot start or join another group in this pass.
            groups[shared] = [first] + [
                w for w in remaining if affix_of(w, size) == shared
            ]
            remaining = [w for w in remaining if affix_of(w, size) != shared]
            grouped = True
            # Stop at the first partner: scanning on would iterate a list
            # that has just had elements removed, which skips candidates.
            break
        if not grouped:
            orphans.append(first)
    # At most one word is left; it had nothing to be compared with.
    orphans.extend(remaining)
    return groups, orphans

结果:

my_suffixes {'_post': ['pre_voa_post', 'bread_post', 'lucky_post', 'banana_post'], 
'ment': ['argument', 'enchantment'], 
'pdf': ['winelistpdf', 'cookbookspdf']}
my_prefixes {'pre_': ['pre_voa_post', 'pre_bbc', 'pre_nbc', 'pre_fox', 'pre_news'],
'thermo': ['thermodynamic', 'thermostat', 'thermometer'], 
'ante': ['anteroom', 'antedate', 'antenatal']}

Orphan_list ['toto', 'mike', 'john', 'blabla', 'edward_lear']

这应该不是一个有效的问题,但是:

def partition(list_of_pref, list_of_words):
    """Split words into one list per prefix.

    Args:
        list_of_pref: Prefixes to look for, in the order the result should
            follow.
        list_of_words: Words to classify. It is scanned once per prefix, so
            it must be re-iterable.

    Returns:
        A list with one entry per prefix, each entry being the list of words
        that start with that prefix, in their original order. A word matching
        several prefixes appears in each matching entry; a word matching none
        is left out.
    """
    return [
        [word for word in list_of_words if word.startswith(prefix)]
        for prefix in list_of_pref
    ]


# Example (IPython session):
# partition(['pre', 'banana'],['pre_bbc', 'pre_nbc', 'pre_fox', 'bread_post', 'pre_news', 'lucky_post', 'banana_post', 'mike', 'john', 'edward_lear', 'winelistpdf', 'cookbookspdf'])
# Out[4]: [['pre_bbc', 'pre_nbc', 'pre_fox', 'pre_news'], ['banana_post']]

对首选项列表执行相同的操作,或者通过在函数内部迭代拆分('_')来生成它们,您就完成了

最新更新