0. Goal

  1. Build a cluster of three Hadoop nodes, including NN + 2NN + historyserver.
  2. Roles assigned to hd01, hd02, hd03:

    • hd01: NameNode + JobHistoryServer + NodeManager + DataNode
    • hd02: NodeManager + DataNode + ResourceManager
    • hd03: SecondaryNameNode + NodeManager + DataNode

1. File preparation

  1. dockerfileHDFS
  2. Makefile
  3. docker-compose.yml
  4. element/ directory
    1. configure/
      1. hadoop/
        1. etc/
    2. images/
      1. hadoop-3.3.0.tar.gz
      2. jdk-8u141-linux-x64.tar.gz
    3. mysh/
      1. syncSshKeygen.sh

  1. dockerfileHDFS

Build the image from this Dockerfile:

docker build -t my-hadoop:3.3.0 -f dockerfileHDFS .

FROM centos:7
# SSH
RUN yum install -y openssh-server sudo
RUN sed -i 's/UsePAM yes/UsePAM no/g' /etc/ssh/sshd_config
RUN echo "PermitRootLogin yes" >> /etc/ssh/sshd_config
RUN yum install -y openssh-clients
RUN echo "root:123456" | chpasswd
RUN echo "root ALL=(ALL) ALL" >> /etc/sudoers
RUN ssh-keygen -t dsa -f /etc/ssh/ssh_host_dsa_key
RUN ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key
RUN mkdir /var/run/sshd
# JAVA
ADD element/images/jdk-8u141-linux-x64.tar.gz /usr/local/
RUN mv /usr/local/jdk1.8.0_141 /usr/local/jdk1.8
ENV JAVA_HOME /usr/local/jdk1.8
ENV PATH $JAVA_HOME/bin:$PATH
# HADOOP
ADD element/images/hadoop-3.3.0.tar.gz /usr/local
RUN mv /usr/local/hadoop-3.3.0 /usr/local/hadoop
ENV HADOOP_HOME /usr/local/hadoop
ENV PATH $HADOOP_HOME/bin:$PATH
RUN yum install -y which sudo
RUN mkdir /mysh
EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]
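
To sanity-check the image before composing the cluster, the JDK and Hadoop install can be exercised directly (a quick optional check, not part of the original steps; my-hadoop:3.3.0 is the tag built above):

docker run --rm my-hadoop:3.3.0 hadoop version
docker run --rm my-hadoop:3.3.0 java -version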

  2. Makefile

.PHONY: config
config:
    sudo rm -rf element/configure/hadoop/etc-hd01 element/configure/hadoop/etc-hd02 element/configure/hadoop/etc-hd03
    sudo mkdir -p element/configure/hadoop/etc-hd01 element/configure/hadoop/etc-hd02 element/configure/hadoop/etc-hd03
    sudo cp -r element/configure/hadoop/etc/* element/configure/hadoop/etc-hd01/
    sudo cp -r element/configure/hadoop/etc/* element/configure/hadoop/etc-hd02/
    sudo cp -r element/configure/hadoop/etc/* element/configure/hadoop/etc-hd03/

.PHONY: up
up:
    docker-compose up -d

    docker exec -it hd01 sh /mysh/syncSshKeygen.sh
    docker exec -it hd02 sh /mysh/syncSshKeygen.sh
    docker exec -it hd03 sh /mysh/syncSshKeygen.sh

.PHONY: start
start:
    docker-compose start

.PHONY: down
down:
    docker-compose down

  3. docker-compose.yml

version: '3.5'
services:
  hd01:
    image: my-hadoop:3.3.0
    container_name: hd01
    hostname: hd01
    extra_hosts:
      - "hd02:172.24.0.12"
      - "hd03:172.24.0.13"
    networks:
      hd-network:
        ipv4_address: 172.24.0.11
    volumes:
      - ${PWD}/element/configure/hadoop/etc-hd01:/usr/local/hadoop/etc
      - ${PWD}/element/mysh:/mysh
    environment:
      - HDFS_NAMENODE_USER=root
      - HDFS_DATANODE_USER=root
      - HDFS_SECONDARYNAMENODE_USER=root
      - YARN_RESOURCEMANAGER_USER=root
      - YARN_NODEMANAGER_USER=root
  hd02:
    image: my-hadoop:3.3.0
    container_name: hd02
    hostname: hd02
    extra_hosts:
      - "hd01:172.24.0.11"
      - "hd03:172.24.0.13"
    networks:
      hd-network:
        ipv4_address: 172.24.0.12
    volumes:
      - ${PWD}/element/configure/hadoop/etc-hd02:/usr/local/hadoop/etc
      - ${PWD}/element/mysh:/mysh
    environment:
      - HDFS_NAMENODE_USER=root
      - HDFS_DATANODE_USER=root
      - HDFS_SECONDARYNAMENODE_USER=root
      - YARN_RESOURCEMANAGER_USER=root
      - YARN_NODEMANAGER_USER=root
  hd03:
    image: my-hadoop:3.3.0
    container_name: hd03
    hostname: hd03
    extra_hosts:
      - "hd01:172.24.0.11"
      - "hd02:172.24.0.12"
    networks:
      hd-network:
        ipv4_address: 172.24.0.13
    volumes:
      - ${PWD}/element/configure/hadoop/etc-hd03:/usr/local/hadoop/etc
      - ${PWD}/element/mysh:/mysh
    environment:
      - HDFS_NAMENODE_USER=root
      - HDFS_DATANODE_USER=root
      - HDFS_SECONDARYNAMENODE_USER=root
      - YARN_RESOURCEMANAGER_USER=root
      - YARN_NODEMANAGER_USER=root
networks:
  hd-network:
    name: hd-network
    ipam:
      config:
        - subnet: 172.24.0.0/24
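
The fixed ipv4_address values plus the extra_hosts entries give every container a stable view of the other two nodes' names. After `make up`, the mapping can be spot-checked from any container (optional check, not in the original):

docker exec hd01 cat /etc/hosts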

  4. element/ directory

    1. element/configure/ directory

This directory is mounted into the containers as a volume so that the configuration can be edited and shared conveniently later on.
The important files here are core-site.xml, hdfs-site.xml and yarn-site.xml, shown below in that order; a sketch of a few additional files such a cluster usually needs follows after these listings.

core-site.xml:

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <!-- NameNode address -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://hd01:8020</value>
    </property>
    <!-- Hadoop data storage directory -->
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/usr/local/hadoop/data</value>
    </property>

    <!-- Proxy user settings -->
    <property>
        <name>hadoop.proxyuser.root.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.root.groups</name>
        <value>*</value>
    </property>
</configuration>

hdfs-site.xml:

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <!-- NameNode web UI address -->
    <property>
        <name>dfs.namenode.http-address</name>
        <value>hd01:9870</value>
    </property>
    <!-- SecondaryNameNode (2NN) web UI address -->
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>hd03:9868</value>
    </property>
</configuration>

yarn-site.xml:

<?xml version="1.0"?>
<configuration>
    <!-- MapReduce shuffle as the NodeManager auxiliary service -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>

    <!-- ResourceManager host -->
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>hd02</value>
    </property>

    <!-- Environment variables inherited by containers -->
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>

    <!-- Enable log aggregation -->
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>

    <!-- Log server (JobHistoryServer) URL -->
    <property>
        <name>yarn.log.server.url</name>
        <value>http://hd01:19888/jobhistory/logs</value>
    </property>

    <!-- Log retention time (seconds) -->
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>604800</value>
    </property>
</configuration>
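
The three files above are the ones called out here. In a Hadoop 3.x cluster of this shape, a few more files in the same etc/hadoop/ directory are usually needed as well: mapred-site.xml (so MapReduce runs on YARN and the JobHistoryServer on hd01 is reachable), the workers file (so start-dfs.sh / start-yarn.sh start daemons on all three nodes), and a JAVA_HOME line in hadoop-env.sh (daemons launched over SSH do not see the ENV set in the Dockerfile). The following is a sketch under those assumptions, not taken from the original:

mapred-site.xml:

<?xml version="1.0"?>
<configuration>
    <!-- Run MapReduce on YARN -->
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <!-- JobHistoryServer addresses (assumed to run on hd01, default ports) -->
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>hd01:10020</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>hd01:19888</value>
    </property>
</configuration>

workers (one hostname per line):

hd01
hd02
hd03

hadoop-env.sh (appended line):

export JAVA_HOME=/usr/local/jdk1.8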

    2. element/images/ directory

Hadoop depends on Java; everything here currently relies on JDK 1.8.

    3. element/mysh/ directory

The .sh scripts here are for cluster operations; there are quite a few, and they will be written up in later updates of this document.
syncSshKeygen.sh sets up passwordless login, which a Hadoop cluster needs so the servers can reach each other without a password.

#!/bin/bash
# Distribute this node's SSH public key to every node in the cluster.
NETS=("hd01" "hd02" "hd03")
USER="root"
PASSWORD="123456"
# Generate an RSA key pair with an empty passphrase
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa &> /dev/null
# Skip the interactive host-key confirmation on first connection
sed -i '/StrictHostKeyChecking/c StrictHostKeyChecking no' /etc/ssh/ssh_config
# Install sshpass if missing, so the password can be supplied non-interactively
rpm -q sshpass &> /dev/null || yum -y install sshpass &> /dev/null
# Copy the public key to each node
for NET in "${NETS[@]}"
do
    sshpass -p $PASSWORD ssh-copy-id -i ${USER}@${NET}
done
wait
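
After `make up` has run this script in every container, passwordless SSH between the nodes can be spot-checked (optional, not in the original):

docker exec hd01 ssh hd02 hostname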

2. Startup

  1. On the host machine

[kali] make config up
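
The [hd01] / [hd02] prefixes below mark the container in which a command is run; a shell can be attached with docker exec, for example:

docker exec -it hd01 bash
cd /usr/local/hadoop        # the sbin/... and bin/... paths below are relative to HADOOP_HOME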

  2. Initialize HDFS

[hd01] hdfs namenode -format

  3. Start DFS

[hd01] sbin/start-dfs.sh
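
Optionally, confirm that all three DataNodes registered with the NameNode:

[hd01] hdfs dfsadmin -report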

  4. Start YARN

[hd02] sbin/start-yarn.sh
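
Optionally, list the NodeManagers that joined the ResourceManager:

[hd02] yarn node -list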

  5. Start the history server

[hd01] bin/mapred --daemon start historyserver
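
Once everything is up, the daemons on each node can be checked against the layout in section 0 with jps (shipped with the JDK):

docker exec hd01 jps    # expect NameNode, DataNode, NodeManager, JobHistoryServer
docker exec hd02 jps    # expect ResourceManager, DataNode, NodeManager
docker exec hd03 jps    # expect SecondaryNameNode, DataNode, NodeManager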

3. Open the monitoring pages

Browse Directory: http://hd01:9870/explorer.html#
All Applications: http://hd02:8088/cluster
JobHistory: http://hd01:19888/jobhistory
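
These hostnames only resolve inside the containers. To open the pages from a browser on the host, one option is to map them to the fixed compose addresses in the host's /etc/hosts (values taken from docker-compose.yml above):

172.24.0.11 hd01
172.24.0.12 hd02
172.24.0.13 hd03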