Docker: building the Ubuntu base image

    FROM ubuntu
    COPY ./get-pip.py /tmp/get-pip.py
    #COPY /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
    COPY ./Shanghai /etc/localtime
    RUN sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list \
        && apt-get update \
        && apt-get -y install software-properties-common \
        && apt-get -y install --reinstall ca-certificates \
        && add-apt-repository -y ppa:openjdk-r/ppa \
        && add-apt-repository -y ppa:deadsnakes/ppa \
        && apt-get -y install openjdk-8-jdk openssh-server vim python3.7 python3-pip python3.7-dev \
        && rm -f /usr/bin/python \
        && ln -s /usr/bin/python3.7 /usr/bin/python \
        && python /tmp/get-pip.py
    RUN mkdir /var/run/sshd \
        && echo 'root:123456' | chpasswd \
        && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config \
        && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
    ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
    EXPOSE 22
    CMD ["/usr/sbin/sshd", "-D"]
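
    A quick way to verify the base image is to build it and log in over SSH; a minimal sketch (the image tag, container name, and host port here are illustrative, not from the original):

    docker build -t cpq_ubuntu:1.0 .
    docker run -d -p 10022:22 --name ubuntu_base cpq_ubuntu:1.0
    ssh root@localhost -p 10022   # password 123456, as set in the Dockerfile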

    Docker: building the Spark + Hive environment

    FROM registry.cn-hangzhou.aliyuncs.com/pqchen/cpq_ubuntu:1.0
    COPY ./mysql-connector-java-5.1.45.tar.gz /tmp/mysql-connector-java-5.1.45.tar.gz
    COPY ./apache-hive-2.3.7-bin.tar.gz /tmp/apache-hive-2.3.7-bin.tar.gz
    COPY ./spark-2.4.7-bin-hadoop2.7.tgz /tmp/spark-2.4.7-bin-hadoop2.7.tgz
    COPY ./hadoop-2.7.4.tar.gz /tmp/hadoop-2.7.4.tar.gz
    RUN mkdir /shared_data
    # -- install Hadoop
    #RUN wget -P /tmp https://archive.apache.org/dist/hadoop/common/hadoop-2.7.4/hadoop-2.7.4.tar.gz
    RUN tar xf /tmp/hadoop-2.7.4.tar.gz -C /tmp \
        && mv /tmp/hadoop-2.7.4 /opt/hadoop \
        && rm /tmp/hadoop-2.7.4.tar.gz
    ENV HADOOP_HOME /opt/hadoop
    ENV HADOOP_COMMON_LIB_NATIVE_DIR $HADOOP_HOME/lib/native
    ENV HADOOP_OPTS "-Djava.library.path=$HADOOP_HOME/lib:$HADOOP_COMMON_LIB_NATIVE_DIR"
    ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64
    # -- install Spark
    #RUN wget -P /tmp https://mirrors.tuna.tsinghua.edu.cn/apache/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
    RUN tar xf /tmp/spark-2.4.7-bin-hadoop2.7.tgz -C /tmp \
        && mv /tmp/spark-2.4.7-bin-hadoop2.7 /opt/spark \
        && rm /tmp/spark-2.4.7-bin-hadoop2.7.tgz \
        && echo 'export PATH=$PATH:$SPARK_HOME/bin' >> ~/.bashrc
    ENV SPARK_HOME /opt/spark
    ENV LD_LIBRARY_PATH $HADOOP_HOME/lib/native/
    # -- install Hive
    #RUN wget -P /tmp https://archive.apache.org/dist/hive/hive-2.3.7/apache-hive-2.3.7-bin.tar.gz
    RUN tar xf /tmp/apache-hive-2.3.7-bin.tar.gz -C /tmp \
        && mv /tmp/apache-hive-2.3.7-bin /opt/hive \
        && echo 'export PATH=$PATH:$HIVE_HOME/bin' >> ~/.bashrc \
        && rm /tmp/apache-hive-2.3.7-bin.tar.gz
    ENV HIVE_HOME /opt/hive
    # -- install the MySQL JDBC driver for Hive and Spark
    #RUN wget -P /tmp https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-5.1.45.tar.gz
    RUN tar xf /tmp/mysql-connector-java-5.1.45.tar.gz -C /tmp \
        && cp /tmp/mysql-connector-java-5.1.45/mysql-connector-java-5.1.45-bin.jar /opt/hive/lib \
        && mv /tmp/mysql-connector-java-5.1.45/mysql-connector-java-5.1.45-bin.jar /opt/spark/jars \
        && rm /tmp/mysql-connector-java-5.1.45.tar.gz
    EXPOSE 22
    CMD ["/usr/sbin/sshd", "-D"]
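
    To make this image available under the name referenced by docker-compose.yml below, the usual build-and-push flow applies (a sketch; the registry path mirrors the one used in the compose file):

    docker build -t registry.cn-hangzhou.aliyuncs.com/pqchen/pyspark-hive:1.0 .
    docker push registry.cn-hangzhou.aliyuncs.com/pqchen/pyspark-hive:1.0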

    Before deploying, check the mount paths in docker-compose.yml. One of them mounts a hive-site.xml file; place that file at the corresponding location. Its contents are shown below: it configures the MySQL host, account, and password, which should be adjusted as needed.

    <configuration>
      <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://192.168.200.172:13306/hive_metastore?createDatabaseIfNotExist=true&amp;useSSL=false</value>
        <description>metadata is stored in a MySQL server</description>
      </property>
      <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
        <description>MySQL JDBC driver class</description>
      </property>
      <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>root</value>
        <description>user name for connecting to mysql server</description>
      </property>
      <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>123456</value>
        <description>password for connecting to mysql server</description>
      </property>
      <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/home/spark/env/container_data/warehouse</value>
        <description>location of default database for the warehouse</description>
      </property>
      <property>
        <name>hive.metastore.schema.verification</name>
        <value>false</value>
      </property>
    </configuration>
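
    Before initializing the metastore it is worth confirming that the MySQL endpoint in ConnectionURL is reachable with the configured credentials; a small sketch using the stock mysql client (host, port, and password taken from the config above):

    mysql -h 192.168.200.172 -P 13306 -u root -p123456 -e 'SELECT VERSION();'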

    For deployment, docker-compose.yml can be used directly.

    version: '2'
    services:
      mysql:
        image: mysql:5.7
        container_name: mysql_hive
        environment:
          MYSQL_ROOT_PASSWORD: 123456
          MYSQL_DATABASE: hive
          MYSQL_USER: hive
          MYSQL_PASSWORD: hive
        ports:
          - 13306:3306
        volumes:
          - ./env/container_data/mysql:/var/lib/mysql
        restart: always
      pyspark:
        image: registry.cn-hangzhou.aliyuncs.com/pqchen/pyspark-hive:1.0
        container_name: pyspark-hive
        ports:
          - 14040:4040
          - 18080-18090:8080-8090
          - 10000:10000
        volumes:
          - ./env/conf:/opt/spark/conf
          - ./env/container_data/spark/warehouse:/shared_data/hive/warehouse
          - ./env/table_data:/shared_data/table_data
          - ./env/conf/hive-site.xml:/opt/hive/conf/hive-site.xml
          - ./:/home/userprofile
        links:
          - "mysql:mysql"
        stdin_open: true
        tty: true
        working_dir: /home/userprofile
        security_opt:
          - seccomp=unconfined
        cap_add:
          - SYS_PTRACE
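
    With the files in place, bringing the stack up is the standard Compose workflow:

    docker-compose up -d                  # start mysql_hive and pyspark-hive
    docker-compose ps                     # confirm both containers are running
    docker exec -it pyspark-hive bash     # open a shell for the steps below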

    Hive initialization

    # initialize the metastore schema
    /opt/hive/bin/schematool -initSchema -dbType mysql
    # start Hive; note: do not launch it with a bare `hive` command, use the full path
    /opt/hive/bin/hive
    # verify startup: no errors, and the database names are returned
    show databases;
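
    Since the compose file maps port 10000, HiveServer2 can also be started and tested over JDBC with Beeline (a sketch; the anonymous login assumes no authentication has been configured):

    /opt/hive/bin/hiveserver2 &
    /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -e 'show databases;'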


    Testing Spark access to Hive tables

    from pyspark.conf import SparkConf
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.config(conf=SparkConf()).enableHiveSupport().getOrCreate()
    sc = spark.sparkContext
    spark.sql("show databases").show()
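
    The snippet can be pasted into the container's PySpark shell (where a `spark` session already exists) or saved as a script and submitted; for example:

    docker exec -it pyspark-hive /opt/spark/bin/pyspark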
