data=[(("James","Bond"),["Java","C#"],{'hair':'black','eye':'brown'}),
(("Ann","Varsa"),[".NET","Python"],{'hair':'brown','eye':'black'}),
(("Tom Cruise",""),["Python","Scala"],{'hair':'red','eye':'grey'}),
(("Tom Brand",None),["Perl","Ruby"],{'hair':'black','eye':'blue'})]
schema = ['n','ln','p']
df = spark.createDataFrame(data,schema=schema)
+-----------------+---------------+--------------------+
| n| ln| p|
+-----------------+---------------+--------------------+
| {James, Bond}| [Java, C#]|{eye -> brown, ha...|
| {Ann, Varsa}| [.NET, Python]|{eye -> black, ha...|
| {Tom Cruise, }|[Python, Scala]|{eye -> grey, hai...|
|{Tom Brand, null}| [Perl, Ruby]|{eye -> blue, hai...|
+-----------------+---------------+--------------------+
name = df.select('n')
我尝试了filter方法在一个单独的列中获取第一个和第二个名称,但没有成功。
所需输出:
first |last
-----------
James |Bond
Tom |Cruise
Tom |Brand
列n
的数据类型为struct
。您可以将结构列转换为数组,然后使用空格作为分隔符" "
连接数组的所有元素。然后,您可以使用相同的分隔符获取split
此列之后的第一个和第二个单词。
from pyspark.sql import functions as F
col_joined = F.array_join(F.array("n.*"), " ")
df = df.select(
F.split(col_joined," ")[0].alias("first"),
F.split(col_joined," ")[1].alias("last"),
)
df.show()
# +-----+------+
# |first| last|
# +-----+------+
# |James| Bond|
# | Ann| Varsa|
# | Tom|Cruise|
# | Tom| Brand|
# +-----+------+
changed the schema according to the struct now I got another solution.
data=[(("James","Bond"),["Java","C#"],{'hair':'black','eye':'brown'}),
(("Ann","Varsa"),[".NET","Python"],{'hair':'brown','eye':'black'}),
(("Tom Cruise",""),["Python","Scala"],{'hair':'red','eye':'grey'}),
(("Tom Brand",None),["Perl","Ruby"],{'hair':'black','eye':'blue'})]
schema2 = StructType([
StructField('name',StructType([
StructField('fname',StringType(),True),
StructField('lname',StringType(),True)
])),
StructField('lng',ArrayType(StringType()),True),
StructField('properties', MapType(StringType(),StringType()),True)
])
df = spark.createDataFrame(data,schema=schema2)
df.select("name.fname","name.lname").show()
+----------+-----+
| fname|lname|
+----------+-----+
| James| Bond|
| Ann|Varsa|
|Tom Cruise| |
| Tom Brand| null|
+----------+-----+