在 PySpark 中,如何根据字典中某个特定键的值,过滤一个含有“字典列表”列的 DataFrame?
也就是说,筛选出这样的行:其 foo_data 列中任意一个字典的 name 属性值出现在我给定的列表中。
# The dataframe
# df.show()
foo_data bar_id
0 [{'name': 'Foo 1'}, {'name': 'Foo 2'}] 42189321899fewa32
1 [{'name': 'Foo 1'}, {'name': 'Foo 3'}] 13829a38291dm2198
2 [{'name': 'Foo 2'}, {'name': 'Foo 3'}] 3910m312091412812
3 [{'name': 'Foo 2'}, {'name': 'Foo 4'}] 2189d2n18u9218219
# Names to look for under the "name" key of the dictionaries stored in the
# "foo_data" column.
foo_list = ["Foo 1", "Foo 4"]
# df_filtered = df.filter...?
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType, BooleanType
# Sample rows: each pairs a list of {'name': ...} dictionaries with an id string.
sample_rows = [
    ([{'name': 'Foo 1'}, {'name': 'Foo 2'}], '42189321899fewa32'),
    ([{'name': 'Foo 1'}, {'name': 'Foo 3'}], '13829a38291dm2198'),
    ([{'name': 'Foo 2'}, {'name': 'Foo 4'}], '2189d2n18u9218219'),
    ([{'name': 'Foo 2'}, {'name': 'Foo 3'}], '239d2n18u92154619'),
]

# Build the DataFrame from the sample rows; only column names are supplied.
df = spark.createDataFrame(sample_rows, schema=['foo_data', 'bar_id'])

# Names to search for under the 'name' key of the foo_data dictionaries.
foo_list = ["Foo 1", "Foo 4"]

# Print the full, untruncated contents of the DataFrame.
df.show(truncate=False)
+----------------------------------------+-----------------+
|foo_data |bar_id |
+----------------------------------------+-----------------+
|[Map(name -> Foo 1), Map(name -> Foo 2)]|42189321899fewa32|
|[Map(name -> Foo 1), Map(name -> Foo 3)]|13829a38291dm2198|
|[Map(name -> Foo 2), Map(name -> Foo 4)]|2189d2n18u9218219|
|[Map(name -> Foo 2), Map(name -> Foo 3)]|239d2n18u92154619|
+----------------------------------------+-----------------+
#Creating a UDF of a function
def list_values(col):
    """Return True if any entry of ``col`` has a ``'name'`` found in ``foo_list``.

    Intended to be wrapped as a boolean UDF and applied to the ``foo_data``
    column, so ``col`` is the value of a single cell.

    Args:
        col: An iterable of mappings, each indexable by ``'name'``, or
            ``None`` for a null cell. The parameter name is kept for
            compatibility even though it shadows the imported pyspark
            ``col`` inside this function.

    Returns:
        bool: ``True`` when at least one entry's ``'name'`` value is contained
        in the module-level ``foo_list``; ``False`` otherwise, including for
        an empty or null cell.
    """
    # A null cell arrives as None; iterating it would raise TypeError, so
    # treat it as "no match" and let the row be filtered out.
    if col is None:
        return False
    # any() stops at the first matching entry.
    return any(entry['name'] in foo_list for entry in col)
# Wrap the Python predicate as a UDF whose declared return type is boolean.
list_values_udf = udf(list_values, BooleanType())
# Keep only the rows where at least one dictionary in 'foo_data' has a 'name'
# value present in the user-supplied 'foo_list'. The UDF's boolean result is
# used directly as the filter condition, so no temporary column is created
# (and an existing column that happened to be called 'bool' is left intact).
df = df.filter(list_values_udf(df.foo_data))
df.show(truncate=False)
+----------------------------------------+-----------------+
|foo_data |bar_id |
+----------------------------------------+-----------------+
|[Map(name -> Foo 1), Map(name -> Foo 2)]|42189321899fewa32|
|[Map(name -> Foo 1), Map(name -> Foo 3)]|13829a38291dm2198|
|[Map(name -> Foo 2), Map(name -> Foo 4)]|2189d2n18u9218219|
+----------------------------------------+-----------------+