我输入一个表情符号数组,想获取其中每个表情符号的 Unicode 码点,并存储到数据帧(DataFrame)中。这是我的代码:
/** Intended to build a DataFrame pairing each emoji in `emojiArray` with its `U+XXXX` code point.
  *
  * NOTE(review): as written, only the LAST element of `emojiArray` ends up in the returned
  * DataFrame, because `result` is reassigned on every loop iteration instead of accumulated.
  * For an empty array the loop never runs and `result.show` is invoked on `null`.
  *
  * @param emojiArray emoji strings to convert
  * @return DataFrame with the columns `emoji` and `result`
  */
def getUnicodeOfEmoji (emojiArray : Array[String]) : DataFrame = {
// Reuse the active SparkSession (or create one); its implicits provide `toDF` on a Seq.
val existingSparkSession = SparkSession.builder().getOrCreate()
import existingSparkSession.implicits._
var result: DataFrame = null
var df : DataFrame = null
for (i <- 0 until emojiArray.length) {
// One-row DataFrame holding only the current element.
df = Seq(emojiArray(i)).toDF("emoji")
df.show()
// Replaces the DataFrame produced by the previous iteration rather than appending to it.
// NOTE(review): the two-argument trim strips '0' from BOTH ends of the hex string, so a
// code point whose hex form ends in 0 would presumably be truncated — confirm.
result = df.selectExpr(
"emoji",
"'U+' || trim('0' , string(hex(encode(emoji, 'utf-32')))) as result"
)
}
result.show(false)
return result
}
}
输入:val emojis = "😃😜😍"
实际输出
|emoji|result |
+-----+-------+
|😍 |U+1F60D|
+-----+-------+
但是我需要数据帧中包含全部 3 个表情符号及其各自的 Unicode 码点。
不需要用 for 循环来构造数据帧。您可以将数组转换为 `Seq`,并使用 `Seq` 的 `toDF` 方法来构造结果数据帧。
/** Builds a DataFrame that pairs every emoji with its Unicode code point.
  *
  * The whole array becomes one DataFrame (one row per element), so no loop is needed.
  * The resulting DataFrame is also printed via `show(false)` before it is returned.
  *
  * @param emojiArray emoji strings, one per output row
  * @return DataFrame with the columns `emoji` (the input string) and `result`
  *         (`U+` followed by the hex of the UTF-32 encoding without leading zeros)
  */
def getUnicodeOfEmoji(emojiArray: Array[String]): DataFrame = {
  // Reuse the active SparkSession (or create one); its implicits provide `toDF` on a Seq.
  val existingSparkSession = SparkSession.builder().getOrCreate()
  import existingSparkSession.implicits._

  val df = emojiArray.toSeq.toDF("emoji")
  val result = df.selectExpr(
    "emoji",
    // Strip only the LEADING zero padding of the UTF-32 hex (e.g. 0001F603 -> 1F603).
    // Trimming both ends would also eat significant trailing zeros and turn
    // U+1F600 (hex 0001F600) into "U+1F6". `hex` already returns a string, so no cast is needed.
    "'U+' || trim(LEADING '0' FROM hex(encode(emoji, 'utf-32'))) as result"
  )
  result.show(false)
  result
}
val emojis = "😃😜😍"
// `\p{block=Emoticons}` matches a single code point from the Unicode "Emoticons" block.
// The leading backslash is required: without it, `p{block=Emoticons}` is not a valid pattern.
val input = raw"\p{block=Emoticons}".r.findAllIn(emojis).toArray
val converted = getUnicodeOfEmoji(input)
+-----+-------+
|emoji|result |
+-----+-------+
|😃 |U+1F603|
|😜 |U+1F61C|
|😍 |U+1F60D|
+-----+-------+
一个小小的改进是:在传入函数之前,直接将表情符号字符串转换为 `Seq[String]`,例如:
/** Builds a DataFrame that pairs every emoji with its Unicode code point.
  *
  * Accepts a `Seq[String]` directly, so the caller does not have to go through an `Array`.
  * The resulting DataFrame is also printed via `show(false)` before it is returned.
  *
  * @param emojiArray emoji strings, one per output row
  * @return DataFrame with the columns `emoji` (the input string) and `result`
  *         (`U+` followed by the hex of the UTF-32 encoding without leading zeros)
  */
def getUnicodeOfEmoji(emojiArray: Seq[String]): DataFrame = {
  // Reuse the active SparkSession (or create one); its implicits provide `toDF` on a Seq.
  val existingSparkSession = SparkSession.builder().getOrCreate()
  import existingSparkSession.implicits._

  val df = emojiArray.toDF("emoji")
  val result = df.selectExpr(
    "emoji",
    // Strip only the LEADING zero padding of the UTF-32 hex (e.g. 0001F603 -> 1F603).
    // Trimming both ends would also eat significant trailing zeros and turn
    // U+1F600 (hex 0001F600) into "U+1F6". `hex` already returns a string, so no cast is needed.
    "'U+' || trim(LEADING '0' FROM hex(encode(emoji, 'utf-32'))) as result"
  )
  result.show(false)
  result
}
val emojis = "😃😜😍"
// `\p{block=Emoticons}` matches a single code point from the Unicode "Emoticons" block.
// The leading backslash is required: without it, `p{block=Emoticons}` is not a valid pattern.
val input = raw"\p{block=Emoticons}".r.findAllIn(emojis).toSeq
val converted = getUnicodeOfEmoji(input)