Download the installation packages

Hadoop

Linux environment setup

Create a user account

Add the user and set a password

```bash
[root@iZnm201imn18dkgebcpx40Z ~]# useradd hadoop
[root@iZnm201imn18dkgebcpx40Z ~]# passwd hadoop
Changing password for user hadoop.
New password:
Retype new password:
passwd: all authentication tokens updated successfully.
```

Give the hadoop user root privileges, so that later we can prepend sudo to run commands that need root.

```bash
[root@iZnm201imn18dkgebcpx40Z ~]# vim /etc/sudoers
```

Edit /etc/sudoers and add one line below the %wheel entry: `hadoop ALL=(ALL) NOPASSWD:ALL`

```
## Allow root to run any commands anywhere
root    ALL=(ALL)       ALL
## Allows people in group wheel to run all commands
%wheel  ALL=(ALL)       ALL
hadoop  ALL=(ALL)       NOPASSWD:ALL
```

Note: do not put the hadoop line directly below the root line. sudoers is evaluated top to bottom and the last matching rule wins, so if the hadoop user is in the wheel group, the later %wheel rule would override the NOPASSWD setting and a password would be required again. That is why the hadoop line must come after the %wheel line. A quick way to confirm the rule took effect is shown below.
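A sanity check (assuming the hadoop user already exists): list the sudo rules that match it, and run a command with -n, which makes sudo fail instead of prompting for a password.

```bash
# Show the sudoers entries that apply to the hadoop user
sudo -l -U hadoop

# Prints OK only if the NOPASSWD rule is effective
su - hadoop -c 'sudo -n true && echo OK'
```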

Change the hostname

```bash
vi /etc/hostname
# View the current hostname
sysctl kernel.hostname
# Set it at runtime, without a reboot
sudo sysctl kernel.hostname=hadoop-node1
```

Create the /opt/module and /opt/software directories and change their owner and group to the hadoop user (the commands appear after the yum setup below).

Edit the hosts file (an example follows this step):

```bash
vi /etc/hosts
```

Edit the DNS resolver configuration:

```bash
vi /etc/resolv.conf
```
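For a three-node cluster, /etc/hosts on every machine needs an entry per node. A sketch with placeholder private IPs (substitute your own addresses):

```bash
# /etc/hosts -- example entries; these IP addresses are placeholders
10.10.178.233 hadoop-node1
10.10.178.234 hadoop-node2
10.10.178.235 hadoop-node3
```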

Switch the yum repository (as root)

Mirror sources:

- NetEase 163: http://mirrors.163.com/.help/
- USTC: https://mirrors.ustc.edu.cn/help/
- Sohu: http://mirrors.sohu.com/help/
- Alibaba Cloud: https://opsx.alibaba.com/mirror
- Tsinghua University: https://mirrors.tuna.tsinghua.edu.cn/
- Zhejiang University: http://mirrors.zju.edu.cn/
- USTC (CentOS mirror): http://centos.ustc.edu.cn/
```bash
sudo sed 's/http:\/\/yum.tbsite.net\/centos/https:\/\/mirrors.ustc.edu.cn\/centos/g' -i /etc/yum.repos.d/CentOS-Base.repo

sudo sed 's/https:\/\/mirrors.ustc.edu.cn\/centos/https:\/\/mirrors.aliyun.com\/centos/g' -i /etc/yum.repos.d/CentOS-Base.repo
```
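Escaping every slash makes those commands hard to read; sed accepts any character as the s-command delimiter, so the same substitution can use | and keep a .bak backup of the original file:

```bash
# Same substitution with | as the delimiter, keeping a backup copy
sudo sed -i.bak 's|http://yum.tbsite.net/centos|https://mirrors.ustc.edu.cn/centos|g' /etc/yum.repos.d/CentOS-Base.repo
```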

Alternatively, back up the existing repo directory and write a fresh CentOS-Base.repo:

```bash
mv /etc/yum.repos.d /etc/yum.repos.d.bak
mkdir /etc/yum.repos.d
vim /etc/yum.repos.d/CentOS-Base.repo
```

```
[base]
name=CentOS-$releasever - Base
failovermethod=priority
baseurl=https://mirrors.ustc.edu.cn/centos/$releasever/os/$basearch/
gpgcheck=1
gpgkey=https://mirrors.ustc.edu.cn/centos/RPM-GPG-KEY-CentOS-7

#released updates
[updates]
name=CentOS-$releasever - Updates
failovermethod=priority
baseurl=https://mirrors.ustc.edu.cn/centos/$releasever/updates/$basearch/
gpgcheck=1
gpgkey=https://mirrors.ustc.edu.cn/centos/RPM-GPG-KEY-CentOS-7

#additional packages that may be useful
[extras]
name=CentOS-$releasever - Extras
failovermethod=priority
baseurl=https://mirrors.ustc.edu.cn/centos/$releasever/extras/$basearch/
gpgcheck=1
gpgkey=https://mirrors.ustc.edu.cn/centos/RPM-GPG-KEY-CentOS-7

#additional packages that extend functionality of existing packages
[centosplus]
name=CentOS-$releasever - Plus
failovermethod=priority
baseurl=https://mirrors.ustc.edu.cn/centos/$releasever/centosplus/$basearch/
gpgcheck=1
enabled=0
gpgkey=https://mirrors.ustc.edu.cn/centos/RPM-GPG-KEY-CentOS-7

#contrib - packages by Centos Users
[contrib]
name=CentOS-$releasever - Contrib
failovermethod=priority
baseurl=https://mirrors.ustc.edu.cn/centos/$releasever/contrib/$basearch/
gpgcheck=1
enabled=0
gpgkey=https://mirrors.ustc.edu.cn/centos/RPM-GPG-KEY-CentOS-7
```

```bash
yum clean all
yum makecache
```
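After rebuilding the cache, a quick way to confirm the new mirrors are in use:

```bash
# Lists the enabled repositories and their package counts
yum repolist
```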

Create the directories and change their ownership to the hadoop user:

```bash
[root@iZnm201imn18dkgebcpx40Z ~]# mkdir /opt/module
[root@iZnm201imn18dkgebcpx40Z ~]# mkdir /opt/software
[root@iZnm201imn18dkgebcpx40Z ~]# chown hadoop:hadoop /opt/module
[root@iZnm201imn18dkgebcpx40Z ~]# chown hadoop:hadoop /opt/software/
```

Cluster distribution script

The script below rsyncs the given files or directories to every node; a usage example follows the script.

```bash
#!/bin/bash

# 1. Check the argument count
if [ $# -lt 1 ]
then
    echo "Not Enough Arguments!"
    exit
fi

# 2. Loop over every machine in the cluster
for host in hadoop-node1 hadoop-node2 hadoop-node3
do
    echo "==================== $host ===================="
    # 3. Loop over every file/directory given and send each one
    for file in "$@"
    do
        # 4. Check that the file exists
        if [ -e "$file" ]
        then
            # 5. Get the parent directory (resolving symlinks)
            pdir=$(cd -P "$(dirname "$file")"; pwd)
            # 6. Get the file name
            fname=$(basename "$file")
            ssh "$host" "mkdir -p $pdir"
            rsync -av "$pdir/$fname" "$host:$pdir"
        else
            echo "$file does not exist!"
        fi
    done
done
```
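Assuming the script is saved as ~/bin/xsync (the name is just a convention; anything on your PATH works):

```bash
chmod +x ~/bin/xsync
# Push a directory tree to hadoop-node1/2/3 in one call
xsync /opt/module
```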

Install the Java environment

Remove the bundled JDK (if present); a dry-run check is shown after the flag notes below.

```bash
[root@iZnm201imn18dkgebcpx40Z ~]# rpm -qa | grep -i java | xargs -n1 rpm -e --nodeps
rpm: no packages given for erase
```

- rpm -qa: list every installed rpm package
- grep -i: match case-insensitively
- xargs -n1: pass one argument at a time
- rpm -e --nodeps: remove a package, ignoring dependencies
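Before piping into rpm -e, it can be worth previewing what grep matched, since everything it finds gets erased:

```bash
# Dry run: list the Java-related packages that would be removed
rpm -qa | grep -i java
```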

Install the JDK

```bash
# Download
[root@iZnm201imn18dkgebcpx40Z ~]# cd /opt/software

# Switch to the hadoop user
[root@iZnm201imn18dkgebcpx40Z software]# su hadoop

[hadoop@iZnm201imn18dkgebcpx40Z software]$ wget https://repo.huaweicloud.com/java/jdk/8u152-b16/jdk-8u152-linux-x64.tar.gz

# Extract into /opt/module
[hadoop@iZnm201imn18dkgebcpx40Z software]$ tar -zxvf jdk-8u152-linux-x64.tar.gz -C /opt/module
```

Configure JDK environment variables

1. Create the file /etc/profile.d/my_env.sh and add the JAVA_HOME entries (the tarball extracts to jdk1.8.0_152):

```bash
sudo vim /etc/profile.d/my_env.sh
```

```bash
#JAVA_HOME
export JAVA_HOME=/opt/module/jdk1.8.0_152
export PATH=$PATH:$JAVA_HOME/bin
```
2. Reload the environment variables:

```bash
source /etc/profile.d/my_env.sh
```

3. Test with java -version:

```bash
[hadoop@iZnm201imn18dkgebcpx40Z module]$ java -version
java version "1.8.0_152"
Java(TM) SE Runtime Environment (build 1.8.0_152-b16)
Java HotSpot(TM) 64-Bit Server VM (build 25.152-b16, mixed mode)
```

Install Hadoop

Download and extract

```bash
# Download
[hadoop@iZnm201imn18dkgebcpx40Z ~]$ cd /opt/software
[hadoop@iZnm201imn18dkgebcpx40Z software]$ wget https://repo.huaweicloud.com/apache/hadoop/common/hadoop-3.1.3/hadoop-3.1.3.tar.gz
# Extract into /opt/module
[hadoop@iZnm201imn18dkgebcpx40Z software]$ tar -zxvf hadoop-3.1.3.tar.gz -C /opt/module
```

Configure Hadoop environment variables

1. Append the following to /etc/profile.d/my_env.sh:

```bash
sudo vim /etc/profile.d/my_env.sh
```

```bash
#HADOOP_HOME
export HADOOP_HOME=/opt/module/hadoop-3.1.3
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
```

2. Reload the environment variables:

```bash
source /etc/profile.d/my_env.sh
```
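A quick check that both PATH entries took effect:

```bash
# Should print "Hadoop 3.1.3" and build information
hadoop version
```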

Hadoop single-node deployment

Pseudo-distributed setup

1. Configure the core file core-site.xml

```bash
[hadoop@iZnm201imn18dkgebcpx40Z hadoop]$ cd $HADOOP_HOME/etc/hadoop
[hadoop@iZnm201imn18dkgebcpx40Z hadoop]$ vim core-site.xml
```

```xml
<configuration>
    <!-- NameNode address -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:8020</value>
    </property>
    <!-- Base directory for Hadoop data -->
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/opt/module/hadoop-3.1.3/data</value>
    </property>
    <!-- Static user for the HDFS web UI -->
    <property>
        <name>hadoop.http.staticuser.user</name>
        <value>hadoop</value>
    </property>
    <!-- Allow the hadoop user to act as a proxy user -->
    <property>
        <name>hadoop.proxyuser.hadoop.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.hadoop.groups</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.hadoop.users</name>
        <value>*</value>
    </property>
</configuration>
```

2. Configure the HDFS file hdfs-site.xml

```bash
vim hdfs-site.xml
```

Add the following:

```xml
<configuration>
    <!-- Test environment: keep a single HDFS replica -->
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>
```
3. Format the NameNode

```bash
[hadoop@iZnm201imn18dkgebcpx40Z hadoop-3.1.3]$ pwd
/opt/module/hadoop-3.1.3
[hadoop@iZnm201imn18dkgebcpx40Z hadoop-3.1.3]$ bin/hdfs namenode -format
```

4. Start HDFS

```bash
sbin/start-dfs.sh
```

Startup failed; the errors mean the hadoop user has no passwordless SSH access to localhost:

```bash
[hadoop@iZnm201imn18dkgebcpx40Z hadoop-3.1.3]$ sbin/start-dfs.sh
Starting namenodes on [iZnm201imn18dkgebcpx40Z]
iZnm201imn18dkgebcpx40Z: Warning: Permanently added 'iznm201imn18dkgebcpx40z,10.10.178.233' (ECDSA) to the list of known hosts.
iZnm201imn18dkgebcpx40Z: Permission denied (publickey,gssapi-keyex,gssapi-with-mic,password).
Starting datanodes
localhost: Permission denied (publickey,gssapi-keyex,gssapi-with-mic,password).
Starting secondary namenodes [iZnm201imn18dkgebcpx40Z]
iZnm201imn18dkgebcpx40Z: Permission denied (publickey,gssapi-keyex,gssapi-with-mic,password).
[hadoop@iZnm201imn18dkgebcpx40Z hadoop-3.1.3]$ jps
28245 Jps
```

Set up passwordless SSH login, then run sbin/start-dfs.sh again:

```bash
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 0600 ~/.ssh/authorized_keys
```
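Before rerunning start-dfs.sh, the key can be verified directly; BatchMode makes ssh fail rather than fall back to a password prompt:

```bash
# Should print the hostname with no password prompt
ssh -o BatchMode=yes localhost hostname
```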
    

Check that the daemons are running with jps:

```bash
[hadoop@iZnm201imn18dkgebcpx40Z hadoop-3.1.3]$ jps
29441 DataNode
33993 Jps
29773 SecondaryNameNode
29279 NameNode
```
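With all three daemons up, a short HDFS round-trip confirms the filesystem actually works (the target path is just an example):

```bash
bin/hdfs dfs -mkdir -p /user/hadoop                      # create a home directory
bin/hdfs dfs -put etc/hadoop/core-site.xml /user/hadoop  # upload a file
bin/hdfs dfs -ls /user/hadoop                            # list it back
```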
    
5. Configure MapReduce jobs to run on YARN

Point MapReduce at YARN in etc/hadoop/mapred-site.xml:

```bash
vim etc/hadoop/mapred-site.xml
```

```xml
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>mapreduce.application.classpath</name>
        <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
    </property>
</configuration>
```

Then edit etc/hadoop/yarn-site.xml:

```bash
vim etc/hadoop/yarn-site.xml
```

```xml
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,PATH,LANG,TZ,HADOOP_MAPRED_HOME</value>
    </property>
</configuration>
```

Start YARN:

```bash
sbin/start-yarn.sh
```
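To verify that YARN schedules jobs end to end, run the bundled example (the jar path matches the 3.1.3 release layout):

```bash
# Estimate pi with 2 map tasks of 10 samples each
bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.3.jar pi 2 10
```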

Restarting the cluster

```bash
# 1. Stop YARN first
sbin/stop-yarn.sh
# 2. Stop DFS
sbin/stop-dfs.sh

# 3. Start DFS again
sbin/start-dfs.sh
# 4. Restart YARN
sbin/start-yarn.sh
```

Appendix: Hadoop start and stop commands

```bash
sbin/start-all.sh   # Start all Hadoop daemons: NameNode, SecondaryNameNode, DataNode, ResourceManager, NodeManager
sbin/stop-all.sh    # Stop all Hadoop daemons: NameNode, SecondaryNameNode, DataNode, ResourceManager, NodeManager

sbin/start-dfs.sh   # Start the HDFS daemons: NameNode, SecondaryNameNode, DataNode
sbin/stop-dfs.sh    # Stop the HDFS daemons: NameNode, SecondaryNameNode, DataNode

sbin/hadoop-daemons.sh start namenode            # Start the NameNode daemon alone
sbin/hadoop-daemons.sh stop namenode             # Stop the NameNode daemon alone
sbin/hadoop-daemons.sh start datanode            # Start the DataNode daemon alone
sbin/hadoop-daemons.sh stop datanode             # Stop the DataNode daemon alone
sbin/hadoop-daemons.sh start secondarynamenode   # Start the SecondaryNameNode daemon alone
sbin/hadoop-daemons.sh stop secondarynamenode    # Stop the SecondaryNameNode daemon alone

sbin/start-yarn.sh                               # Start ResourceManager and NodeManager
sbin/stop-yarn.sh                                # Stop ResourceManager and NodeManager
sbin/yarn-daemon.sh start resourcemanager        # Start the ResourceManager alone
sbin/yarn-daemons.sh start nodemanager           # Start the NodeManager alone
sbin/yarn-daemon.sh stop resourcemanager         # Stop the ResourceManager alone
sbin/yarn-daemons.sh stop nodemanager            # Stop the NodeManager alone

sbin/mr-jobhistory-daemon.sh start historyserver # Start the JobHistory server manually
sbin/mr-jobhistory-daemon.sh stop historyserver  # Stop the JobHistory server manually
```
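Note: on Hadoop 3.x the per-daemon helper scripts above still work but are deprecated; the supported equivalents use the --daemon option on the main commands:

```bash
bin/hdfs --daemon start namenode          # also: stop; datanode, secondarynamenode
bin/yarn --daemon start resourcemanager   # also: stop; nodemanager
bin/mapred --daemon start historyserver   # also: stop
```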