我下面的代码是使用 Spark 从 Hive 表中读取数据。该表中有 1 亿条记录。当我在 Rdd 中选择这么多记录并尝试执行 result.show() 时,它会出现严重的问题异常。
我基本上想通过从此表中为 1 亿条记录集选择几列来在其他表中插入记录。
这是我的代码:
import org.apache.spark.sql.functions._
import org.apache.spark.sql._
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
val result=sqlContext.sql("Select * from ******reception.recp_customer")
result: org.apache.spark.sql.DataFrame = [data_source_id: smallint, customer_bkey: string ... 129 more fields]
result.show()
java.lang.RuntimeException: serious problem
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:1064)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getSplits(OrcInputFormat.java:1091)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:202)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:311)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2371)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2773)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2370)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2377)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2113)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2112)
at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2803)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2112)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2327)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:248)
at org.apache.spark.sql.Dataset.show(Dataset.scala:636)
at org.apache.spark.sql.Dataset.show(Dataset.scala:595)
at org.apache.spark.sql.Dataset.show(Dataset.scala:604)
... 52 elided
Caused by: java.util.concurrent.ExecutionException: java.lang.NumberFormatException: For input string: "0000312_0000"
at java.util.concurrent.FutureTask.report(FutureTask.java:122)
at java.util.concurrent.FutureTask.get(FutureTask.java:188)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:1041)
... 94 more
Caused by: java.lang.NumberFormatException: For input string: "0000312_0000"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Long.parseLong(Long.java:441)
at java.lang.Long.parseLong(Long.java:483)
at org.apache.hadoop.hive.ql.io.AcidUtils.parseDelta(AcidUtils.java:323)
at org.apache.hadoop.hive.ql.io.AcidUtils.getAcidState(AcidUtils.java:394)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$FileGenerator.callInternal(OrcInputFormat.java:658)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$FileGenerator$1.run(OrcInputFormat.java:648)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$FileGenerator$1.run(OrcInputFormat.java:645)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:421)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1595)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$FileGenerator.call(OrcInputFormat.java:645)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$FileGenerator.call(OrcInputFormat.java:626)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1152)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:622)
at java.lang.Thread.run(Thread.java:748)
对导致这种情况的原因一无所知。我知道数据集是巨大的如何处理它。
看起来您的hive
表是一个ACID
表。对于只能使用 hive
进行查询acid
表,不能使用 spark
查询它们,因为 Spark 尚不支持此功能。
您可以按照以下JIRA
票进行参考https://issues.apache.org/jira/browse/SPARK-15348
对 Hive 表运行主要压缩,然后尝试从 Spark shell 或代码中读取表,它必须工作。
连接 Hive 后运行
更改表.recp_customer紧凑的"主要";
行:由以下原因引起:java.lang.NumberFormatException:对于输入字符串:"0000312_0000"显示您正在尝试使用字符串值,因为它是数字。看看吧。