我有一个 PySpark DataFrame：
| ids | names |
|---|---|
| [1, 1, 2, 3, 1, 2, 3, 7, 5] | [a, b, c, l, s, o, c, d, e] |
| [3, 8, 9, 3, 9, 0, 0, 6, 7, 8] | [s, l, h, p, q, g, c, d, p, s] |
| [9, 6, 5, 4, 7, 6, 5, 9, 2, 5, 5, 4, 7] | [q, a, z, w, s, e, r, t, y, o, p, a, x] |
使用 rdd + map 的快速解法：
def split(r, delimiter=7):
    """Cut two parallel sequences into chunks that each end at a delimiter.

    The two fields of ``r`` are walked in lockstep. Whenever the value from
    the first field equals ``delimiter``, the elements gathered so far
    (including the delimiter itself and its partner from the second field)
    are closed off as one chunk.

    Args:
        r: A two-field record (for example a row holding two arrays). It is
            unpacked with ``zip(*r)``, so iteration stops at the shorter
            field.
        delimiter: Value in the first field that terminates a chunk.
            Defaults to 7.

    Returns:
        A list made of the original fields of ``r`` followed by the list of
        chunks from the first field and the list of chunks from the second.
        Elements after the last delimiter are not part of any chunk.
    """
    first_chunks, second_chunks = [], []
    first_run, second_run = [], []
    for x, y in zip(*r):
        first_run.append(x)
        second_run.append(y)
        if x == delimiter:
            # The delimiter closes the current chunk; start fresh lists so
            # the stored chunks are never mutated afterwards.
            first_chunks.append(first_run)
            second_chunks.append(second_run)
            first_run, second_run = [], []
    # Whatever remains in first_run / second_run had no closing delimiter
    # and is intentionally left out.
    return [*r, first_chunks, second_chunks]
# Run split on every row through the RDD API, then rebuild a DataFrame whose
# four columns are the two original arrays followed by the two chunked ones.
# NOTE(review): df is defined outside this snippet; presumably it has exactly
# the two array columns split expects. Confirm against the caller.
result = df.rdd.map(split).toDF(['ids', 'names', 'ids_splited', 'names_splited'])
结果
+---------------------------------------+---------------------------------------+-------------------------------------------+-------------------------------------------+
|ids |names |ids_splited |names_splited |
+---------------------------------------+---------------------------------------+-------------------------------------------+-------------------------------------------+
|[1, 1, 2, 3, 1, 2, 3, 7, 5] |[a, b, c, l, s, o, c, d, e] |[[1, 1, 2, 3, 1, 2, 3, 7]] |[[a, b, c, l, s, o, c, d]] |
|[3, 8, 9, 3, 9, 0, 0, 6, 7, 8] |[s, l, h, p, q, g, c, d, p, s] |[[3, 8, 9, 3, 9, 0, 0, 6, 7]] |[[s, l, h, p, q, g, c, d, p]] |
|[9, 6, 5, 4, 7, 6, 5, 9, 2, 5, 5, 4, 7]|[q, a, z, w, s, e, r, t, y, o, p, a, x]|[[9, 6, 5, 4, 7], [6, 5, 9, 2, 5, 5, 4, 7]]|[[q, a, z, w, s], [e, r, t, y, o, p, a, x]]|
+---------------------------------------+---------------------------------------+-------------------------------------------+-------------------------------------------+