PySpark:如何根据PySpark数据框中的值拆分数组,并对与之对应的另一列数组做同样的拆分



我有一个Pyspark数据框架:

| id | name |
|---|---|
| [1, 1, 2, 3, 1, 2, 3, 7, 5] | [a, b, c, l, s, o, c, d, e] |
| [3, 8, 9, 3, 9, 0, 0, 6, 7, 8] | [s, l, h, p, q, g, c, d, p, s] |
| [9, 6, 5, 4, 7, 6, 5, 9, 2, 5, 5, 4, 7] | [q, a, z, w, s, e, r, t, y, o, p, a, x] |

使用 rdd + map 的快速解法

def split(r):
    """Split a row's two parallel arrays into sub-groups delimited by 7.

    `r` is a pair of equal-length sequences (ids, names). Walking both in
    lockstep, a group is closed every time the id value 7 is seen; the 7
    (and its paired name) belongs to the group it closes. Elements after
    the last 7 are discarded — this matches the expected output shown.

    Returns [ids, names, id_groups, name_groups].
    """
    id_groups, name_groups = [], []
    cur_ids, cur_names = [], []
    for val, label in zip(*r):
        cur_ids.append(val)
        cur_names.append(label)
        if val == 7:
            # Close the current group; start fresh for the next one.
            id_groups.append(cur_ids)
            name_groups.append(cur_names)
            cur_ids, cur_names = [], []
    return [*r, id_groups, name_groups]
# Apply the row-wise splitter to every row of `df` and rebuild a DataFrame
# with the two original array columns plus the two grouped columns.
# NOTE(review): assumes `df` has exactly two parallel array columns — confirm.
result = df.rdd.map(split).toDF(['ids', 'names', 'ids_splited', 'names_splited'])

结果

+---------------------------------------+---------------------------------------+-------------------------------------------+-------------------------------------------+
|ids                                    |names                                  |ids_splited                                |names_splited                              |
+---------------------------------------+---------------------------------------+-------------------------------------------+-------------------------------------------+
|[1, 1, 2, 3, 1, 2, 3, 7, 5]            |[a, b, c, l, s, o, c, d, e]            |[[1, 1, 2, 3, 1, 2, 3, 7]]                 |[[a, b, c, l, s, o, c, d]]                 |
|[3, 8, 9, 3, 9, 0, 0, 6, 7, 8]         |[s, l, h, p, q, g, c, d, p, s]         |[[3, 8, 9, 3, 9, 0, 0, 6, 7]]              |[[s, l, h, p, q, g, c, d, p]]              |
|[9, 6, 5, 4, 7, 6, 5, 9, 2, 5, 5, 4, 7]|[q, a, z, w, s, e, r, t, y, o, p, a, x]|[[9, 6, 5, 4, 7], [6, 5, 9, 2, 5, 5, 4, 7]]|[[q, a, z, w, s], [e, r, t, y, o, p, a, x]]|
+---------------------------------------+---------------------------------------+-------------------------------------------+-------------------------------------------+

最新更新