Unable to connect Hive and Kafka



I have a project that involves training a model and then storing the results produced by the best model in Hive via a Kafka topic.

I have tried various configurations and solutions, but none of them work.

The docker-compose file I am using:

version: "3"
services:
namenode:
image: bde2020/hadoop-namenode:1.1.0-hadoop2.8-java8
container_name: namenode
volumes:
- namenode:/hadoop/dfs/name
- ./infra/zeppelin/examples:/opt/sansa-examples
environment:
- CLUSTER_NAME=test
env_file:
- ./infra/hadoop/hadoop-hive.env
ports:
- "50070:50070"
- "8020:8020"
- "8081:8081"
datanode:
image: bde2020/hadoop-datanode:1.1.0-hadoop2.8-java8
container_name: datanode
volumes:
- datanode:/hadoop/dfs/data
env_file:
- ./infra/hadoop/hadoop-hive.env
links:
- namenode
spark-master:
image: bde2020/spark-master:2.1.0-hadoop2.8-hive-java8
container_name: spark-master
ports:
- "8090:800"
- "7077:7077"
environment:
- CORE_CONF_fs_defaultFS=hdfs://namenode:8020
- SPARK_PUBLIC_DNS=localhost
depends_on:
- namenode
- datanode
links:
- namenode
- datanode
spark-worker:
image: bde2020/spark-worker:2.1.0-hadoop2.8-hive-java8
container_name: spark-worker
ports:
- "8083:8083"
environment:
- "SPARK_MASTER=spark://spark-master:7077"
environment:
- CORE_CONF_fs_defaultFS=hdfs://namenode:8020
- SPARK_PUBLIC_DNS=localhost
links:
- spark-master
hue:
image: bde2020/hdfs-filebrowser:3.11
container_name: hue
ports:
- 8088:8088
environment:
- NAMENODE_HOST=namenode
- SPARK_MASTER=spark://spark-master:7077
links:
- spark-master
zeppelin:
image: bde2020/zeppelin:0.0.1-zeppelin-0.7.1-hadoop-2.8.0-spark-2.1.0
container_name: zeppelin
ports:
- 8080:8080
volumes:
- ./data:/data
- ./data:/opt/zeppelin/data
#    - ./infra/zeppelin/conf:/opt/zeppelin/conf
- ./infra/zeppelin/logs:/opt/zeppelin/logs
- ./infra/zeppelin/notebooks:/opt/zeppelin/notebook
- ./infra/zeppelin/examples:/opt/sansa-examples
environment:
CORE_CONF_fs_defaultFS: "hdfs://namenode:8020"
SPARK_MASTER: "spark://spark-master:7077"
MASTER: "spark://spark-master:7077"
SPARK_SUBMIT_OPTIONS: "--jars /opt/sansa-examples/jars/sansa-examples-spark.jar --conf spark.serializer=org.apache.spark.serializer.KryoSerializer"
links:
- spark-master
hive-server:
image: bde2020/hive
container_name: hive-server
env_file:
- ./infra/hadoop/hadoop-hive.env
environment:
- "HIVE_CORE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore/metastore"
links:
- namenode
- hive-metastore
ports:
- 10000:10000
hive-metastore-postgresql:
image: bde2020/hive-metastore-postgresql
container_name: hive-metastore-postgresql
hive-metastore:
image: bde2020/hive
container_name: hive-metastore
env_file:
- ./infra/hadoop/hadoop-hive.env
links:
- namenode
- hive-metastore-postgresql
command: /opt/hive/bin/hive --service metastore
ports:
- 9083:9083
zookeeper:
image: confluentinc/cp-zookeeper
container_name: zookeeper
environment:
ZOOKEEPER_CLIENT_PORT: 2181
volumes:
- zookeeper:/var/lib/zookeeper
kafka:
image: wurstmeister/kafka
container_name: kafka
ports:
- "9092:9092"
environment:
KAFKA_ADVERTISED_HOST_NAME: localhost
KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
links:
- zookeeper
depends_on:
- zookeeper  

# kafka:
#   image: confluentinc/cp-kafka
#   container_name: kafka
#   ports:
#     - 9092:9092
#   environment:
#     KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
#     KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
#     KAFKA_NUM_PARTITIONS: 1
#     KAFKA_DEFAULT_REPLICATION_FACTOR: 1
#     KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
#     KAFKA_DELETE_TOPIC_ENABLE: "true"
#   volumes:
#     - kafka:/var/lib/kafka
#   links:
#     - zookeeper
#   depends_on:
#     - zookeeper
nifi:
image: xemuliam/nifi
container_name: nifi
ports:
- 5080:5080
- 5443:8443
- 5081:5081
## for scaling we have to do this
#      - 8080
links:
- zookeeper
- kafka
depends_on:
- zookeeper
- kafka
volumes:
- ./infra/nifi/conf:/opt/nifi/conf
- ./infra/nifi/logs:/opt/nifi/logs
- ./data:/opt/datafiles
- nifi:/opt/nifi/flowfile_repository
- nifi:/opt/nifi/database_repository
- nifi:/opt/nifi/content_repository
- nifi:/opt/nifi/provenance_repository
environment:
ZK_NODES_LIST: zookeeper
IS_CLUSTER_NODE: 1
ELECTION_TIME: 1 min
volumes:
namenode:
datanode:
zookeeper:
kafka:
nifi:

Here is the Hadoop environment file (hadoop-hive.env):

HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083
HIVE_SITE_CONF_hive_fetch_task_conversion=none
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false
YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031

My Hadoop version is 2.7.4 and my Hive version is 2.3.2.

Here is the Hive interpreter configuration inside Zeppelin: [screenshot]

And here is the error: [screenshot]

You need to add the Kafka storage handler JAR to the HiveServer2 container's classpath, because HiveServer2 is what actually executes the query, not Zeppelin.
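For context, that handler is what would let HiveServer2 expose a Kafka topic as an external table. A rough sketch of the DDL, with a made-up table name, topic, and schema (KafkaStorageHandler ships with Hive's kafka-handler module, which targets newer Hive releases than the 2.3.2 used here, so verify compatibility):

CREATE EXTERNAL TABLE model_results (
  model_id STRING,
  score DOUBLE
)
STORED BY 'org.apache.hadoop.hive.kafka.KafkaStorageHandler'
TBLPROPERTIES (
  'kafka.topic' = 'model-results',
  'kafka.bootstrap.servers' = 'kafka:9092'
);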

The only way to do that is to mount a volume in the Compose file at a path Hive uses to load libraries, as sketched below.
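A minimal sketch of that mount, assuming you have downloaded the handler JAR and its dependencies to ./infra/hive/lib on the host; /opt/hive/auxlib is an assumption based on the image using /opt/hive as HIVE_HOME (the hive launcher script adds JARs under $HIVE_HOME/auxlib to the classpath):

hive-server:
  image: bde2020/hive
  container_name: hive-server
  env_file:
    - ./infra/hadoop/hadoop-hive.env
  environment:
    - "HIVE_CORE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore/metastore"
  volumes:
    # hypothetical host directory holding the kafka-handler JAR and its dependencies
    - ./infra/hive/lib:/opt/hive/auxlib
  links:
    - namenode
    - hive-metastore
  ports:
    - 10000:10000

After recreating the container, the JARs should be on the HiveServer2 classpath; if the image ignores auxlib, hive.aux.jars.path can instead be set through the same HIVE_SITE_CONF_* mechanism used in hadoop-hive.env.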

Otherwise, just use Spark Structured Streaming, or your NiFi container, to write to Kafka. Hive would only store the metadata; the real data lives in Kafka. Also, I am not sure whether Cloudera still maintains the Hive Kafka Handler, and it does not seem to be published to Maven Central either.
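For the Spark route, here is a hedged Scala sketch of writing a results DataFrame to Kafka with the spark-sql-kafka-0-10 connector. Note the Kafka sink only appeared in Spark 2.2, so the 2.1.0 images in the Compose file would need upgrading; the topic name and schema are made up, and kafka:9092 assumes the broker advertises an address reachable from other containers (the current KAFKA_ADVERTISED_HOST_NAME: localhost setting would not be).

import org.apache.spark.sql.SparkSession

object WriteModelResultsToKafka {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("write-model-results")
      .getOrCreate()
    import spark.implicits._

    // Placeholder for the best model's output; substitute the real DataFrame.
    val results = Seq(("model-1", 0.92), ("model-2", 0.87)).toDF("model_id", "score")

    // The Kafka sink expects a `value` column (and optionally `key`).
    results
      .selectExpr("model_id AS key", "CAST(score AS STRING) AS value")
      .write
      .format("kafka") // needs org.apache.spark:spark-sql-kafka-0-10 on the classpath
      .option("kafka.bootstrap.servers", "kafka:9092")
      .option("topic", "model-results") // hypothetical topic
      .save()

    spark.stop()
  }
}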