使用用户定义的比较函数在两个Pandas数据框之间查找



我有两个Pandas数据框(示例仅作说明)。列 df1['list_of_keywords'] 和 df2['list_of_words'] 各自包含一个单词列表。

# Empty frames that only declare the column layout of the two tables.
df1 = pd.DataFrame(columns=["some_data", "another_data", "list_of_keywords"])
df2 = pd.DataFrame(
    columns=["something", "something_more", "something_else", "list_of_words"]
)

df1:

list_of_keywords
['word1', 'word2', 'word3']
['word7', 'word8', 'word7']

你可以试试:

# Setup
import pandas as pd

df1 = pd.DataFrame(
    {
        "some_data": ["id0001", "id0002", "id0003"],
        "another_data": [12391, 3233, 3426],
        "list_of_keywords": [
            ["word1", "word2", "word3"],
            ["word7", "word8", "word7"],
            ["word1", "word2", "word4"],
        ],
    }
)
df2 = pd.DataFrame(
    {
        "something": ["id_abcd", "id_eeed", "id_dgef"],
        "something_more": ["ref34322", "ref5555", "ref2963"],
        "something_else": ["some comment", "some comment", "some comment"],
        "list_of_words": [
            ["word5", "word4", "word5", "word4", "word9"],
            ["word5", "word3", "word2", "word4", "word1"],
            ["word1", "word2", "word3", "word4", "word6"],
        ],
    }
)

# Data preparation
lists_of_keywords = df1["list_of_keywords"].values
lists_of_words = df2["list_of_words"].values
# Build each df2 word set once instead of once per (df1 row, df2 row) pair.
word_sets = [set(list_of_words) for list_of_words in lists_of_words]

# A df1 row matches when every one of its keywords occurs in at least one
# df2 word list. Real booleans are stored (not the strings "True"/"False")
# so the column can be used directly as a mask; the string "False" is truthy.
match = {
    "in_df2": [
        any(set(list_of_keywords).issubset(words) for words in word_sets)
        for list_of_keywords in lists_of_keywords
    ]
}

# Assign the plain list so values are placed positionally; assigning a
# DataFrame/Series would align on index labels and yield NaN whenever df1
# does not carry the default RangeIndex.
df1["in_df2"] = match["in_df2"]
print(df1)
# Outputs
some_data  another_data       list_of_keywords in_df2
0    id0001         12391  [word1, word2, word3]   True
1    id0002          3233  [word7, word8, word7]  False
2    id0003          3426  [word1, word2, word4]   True

相关内容

  • 没有找到相关文章

最新更新