我有一个 DataFrame,其架构(schema)如下:
root
|-- id: long (nullable = true)
|-- products: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: integer (nullable = true)
| | |-- name: string (nullable = true)
| | |-- created_at: long (nullable = true)
| | |-- updated_at: long (nullable = true)
| | |-- product_color: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- id: integer (nullable = true)
| | | | |-- color: string (nullable = true)
| | | | |-- created_at: long (nullable = true)
| | | | |-- updated_at: long (nullable = true)
| | | | |-- products_id: long (nullable = true)
| | |-- orders_id: long (nullable = true)
现在,我想用 products 中的 product_color 创建一个新列,于是在 DataFrame 上像下面这样添加了一个新列:
df.withColumn("product_color", col(currentNode + "." + fieldName))
添加新列之后的架构如下:
root
|-- id: long (nullable = true)
|-- products: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: integer (nullable = true)
| | |-- name: string (nullable = true)
| | |-- created_at: long (nullable = true)
| | |-- updated_at: long (nullable = true)
| | |-- product_color: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- id: integer (nullable = true)
| | | | |-- color: string (nullable = true)
| | | | |-- created_at: long (nullable = true)
| | | | |-- updated_at: long (nullable = true)
| | | | |-- products_id: long (nullable = true)
| | |-- orders_id: long (nullable = true)
|-- product_color: array (nullable = true)
| |-- element: array (containsNull = true)
| | |-- element: struct (containsNull = true)
| | | |-- id: integer (nullable = true)
| | | |-- color: string (nullable = true)
| | | |-- created_at: long (nullable = true)
| | | |-- updated_at: long (nullable = true)
| | | |-- products_id: long (nullable = true)
如果查看新列 product_color 的架构,会发现它多了一层数组嵌套(元素本身又是一个数组):
|-- element: array (containsNull = true)
我想知道如何创建这个新列,使其架构与 products 结构中 product_color 的架构完全一致。
预期架构:
root
|-- id: long (nullable = true)
|-- products: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: integer (nullable = true)
| | |-- name: string (nullable = true)
| | |-- created_at: long (nullable = true)
| | |-- updated_at: long (nullable = true)
| | |-- product_color: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- id: integer (nullable = true)
| | | | |-- color: string (nullable = true)
| | | | |-- created_at: long (nullable = true)
| | | | |-- updated_at: long (nullable = true)
| | | | |-- products_id: long (nullable = true)
| | |-- orders_id: long (nullable = true)
|-- product_color: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: integer (nullable = true)
| | |-- color: string (nullable = true)
| | |-- created_at: long (nullable = true)
| | |-- updated_at: long (nullable = true)
| | |-- products_id: long (nullable = true)
Spark 版本:2.4.5,语言:Scala
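由于 products 本身是数组,products.product_color 会从每个元素中取出 product_color,因此得到的是"数组的数组"。在添加新列时使用 explode 展开外层数组,即可得到所需的架构。
注意:explode 会为 products 中的每个元素生成一行;如果希望保持每条记录一行,可以改用 flatten(Spark 2.4 及以上版本提供)把嵌套数组合并为一个数组。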
Example:
//sample df schema
df.printSchema
//root
// |-- id: long (nullable = true)
// |-- products: array (nullable = true)
// | |-- element: struct (containsNull = true)
// | | |-- id: long (nullable = true)
// | | |-- order_id: long (nullable = true)
// | | |-- product_color: array (nullable = true)
// | | | |-- element: struct (containsNull = true)
// | | | | |-- color: string (nullable = true)
// | | | | |-- id: long (nullable = true)
// | | | | |-- products_id: long (nullable = true)
df.withColumn("product_color",explode(col("products.product_color"))).printSchema
//root
// |-- id: long (nullable = true)
// |-- products: array (nullable = true)
// | |-- element: struct (containsNull = true)
// | | |-- id: long (nullable = true)
// | | |-- order_id: long (nullable = true)
// | | |-- product_color: array (nullable = true)
// | | | |-- element: struct (containsNull = true)
// | | | | |-- color: string (nullable = true)
// | | | | |-- id: long (nullable = true)
// | | | | |-- products_id: long (nullable = true)
// |-- product_color: array (nullable = true)
// | |-- element: struct (containsNull = true)
// | | |-- color: string (nullable = true)
// | | |-- id: long (nullable = true)
// | | |-- products_id: long (nullable = true)