hadoop
HOME
# --- One-time host setup -------------------------------------------------
# Show the current hostname
hostname
# Set the hostname (adjust per node)
sudo hostnamectl set-hostname linux01

# Stop the firewall and disable it on boot
systemctl stop firewalld
systemctl disable firewalld.service

# Create /etc/profile.d/my_home.sh containing the exports below
export HADOOP_HOME=/opt/hadoop-3.1.3
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
# FIX: original seeded JAVA_LIBRARY_PATH from $PATH; the native-library
# search path only needs Hadoop's native directory.
export JAVA_LIBRARY_PATH=$HADOOP_HOME/lib/native
export JAVA_HOME=/opt/jdk1.8.0_321
export PATH=$PATH:$JAVA_HOME/bin
export SCALA_HOME=/opt/scala-2.12.16
export PATH=$PATH:$SCALA_HOME/bin
export SBT_HOME=/opt/sbt
export PATH=$PATH:$SBT_HOME/bin
export SPARK_HOME=/opt/spark-yarn
# FIX: original line was "export H=$PATH:$SPARK_HOME/bin" — a typo that
# left Spark's bin directory off the PATH entirely.
export PATH=$PATH:$SPARK_HOME/bin

# Reload so the new variables take effect in the current shell
source /etc/profile
# Show / set the hostname (run once per node)
hostname
sudo hostnamectl set-hostname linux01

# Inspect the current NIC config
cat /etc/sysconfig/network-scripts/ifcfg-ens33

# Write a static-IP config. FIX: the original used a bare "cat > file"
# followed by inline content, which blocks waiting on stdin; a quoted
# heredoc writes the intended content.
cat > /etc/sysconfig/network-scripts/ifcfg-ens33 <<'EOF'
DEVICE=ens33
TYPE=Ethernet
ONBOOT=yes
BOOTPROTO=static
NAME="ens33"
IPADDR=192.168.10.100
PREFIX=24
GATEWAY=192.168.10.2
DNS1=192.168.10.2
EOF

# Reboot to apply
reboot

# Map the cluster host names. NOTE(review): "cat >" truncates /etc/hosts,
# dropping the stock localhost entries — use ">>" if those must be kept.
cat > /etc/hosts <<'EOF'
192.168.10.100 linux00
192.168.10.101 linux01
192.168.10.102 linux02
192.168.10.103 linux03
192.168.10.104 linux04
192.168.10.105 linux05
192.168.10.106 linux06
192.168.10.107 linux07
192.168.10.108 linux08
192.168.10.109 linux09
EOF
# Bring the NIC up and hand control back to the classic network service
ifconfig ens33 up
systemctl stop NetworkManager
ifup ens33
systemctl restart network.service
# Verify the interface and IP
ifconfig
#!/bin/bash
# Sanity check: print the version of every tool in the stack.
echo "=========================================="
java -version
echo "=========================================="
hadoop version
echo "=========================================="
scala -version
echo "=========================================="
sbt --version
echo "=========================================="
# Launches an interactive Spark shell; exit it with :quit
$SPARK_HOME/bin/spark-shell
免密登录
# Generate a key pair (accept the defaults), then push the public key to
# each worker node so ssh/rsync run without a password prompt.
ssh-keygen -t rsa
ssh-copy-id linux02
ssh-copy-id linux03
ssh-copy-id linux04
分发脚本
# NOTE(review): /bin/sync is the coreutils `sync` binary — placing the
# distribution script there shadows it; presumably the script should be
# installed as /bin/xsync instead — confirm. 777 also grants world write;
# 755 would be sufficient to make it executable.
chmod 777 /bin/sync
#!/bin/bash
# xsync — distribute files/directories to every worker node via rsync,
# recreating the source's parent directory on each host first.
# Usage: xsync <path>...

# 1. Require at least one argument
if [ $# -lt 1 ]; then
  echo Not Enough Arguement!
  exit
fi

# 2. Visit every machine in the cluster
for host in linux02 linux03 linux04; do
  echo ==================== $host ====================
  # 3. Send each requested path in turn (quoted "$@" keeps paths with
  # spaces intact; the original's bare $@ would split them)
  for file in "$@"; do
    # 4. Skip paths that do not exist locally
    if [ -e "$file" ]; then
      # 5. Resolve the parent directory, following symlinks (-P)
      pdir=$(cd -P "$(dirname "$file")" && pwd)
      # 6. Bare file name
      fname=$(basename "$file")
      # Recreate the directory layout remotely, then sync the file
      ssh "$host" "mkdir -p $pdir"
      rsync -av "$pdir/$fname" "$host:$pdir"
    else
      echo "$file does not exists!"
    fi
  done
done
集群部署规划
NameNode和SecondaryNameNode不要安装在同一台服务器 ResourceManager也很消耗内存,不要和NameNode、SecondaryNameNode配置在同一台机器上。
|      | linux02                              | linux03                      | linux04                     |
|------|--------------------------------------|------------------------------|-----------------------------|
| HDFS | NameNode, DataNode, JobHistoryServer | DataNode                     | SecondaryNameNode, DataNode |
| YARN | NodeManager                          | ResourceManager, NodeManager | NodeManager                 |
conf
/opt/hadoop-3.1.3/etc/hadoop
<configuration>
  <!-- Address of the NameNode -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://linux02:8020</value>
  </property>
  <!-- Directory where Hadoop stores its data -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/opt/hadoop-3.1.3/data</value>
  </property>
  <!-- Static user for HDFS web-UI access: root -->
  <property>
    <name>hadoop.http.staticuser.user</name>
    <value>root</value>
  </property>
</configuration>
<configuration>
<!-- NameNode web UI address -->
<property>
<name>dfs.namenode.http-address</name>
<value>linux02:9870</value>
</property>
<!-- SecondaryNameNode web UI address -->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>linux04:9868</value>
</property>
</configuration>
<configuration>
  <!-- Run the MapReduce shuffle as a NodeManager auxiliary service -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <!-- Host running the ResourceManager -->
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>linux03</value>
  </property>
  <!-- Environment variables containers inherit from the NodeManager.
       FIX: the original value was hard-wrapped mid-name
       (HADOOP_CO|NF_DIR, HADOOP_MAP|RED_HOME), corrupting the list;
       it must be one unbroken comma-separated line. -->
  <property>
    <name>yarn.nodemanager.env-whitelist</name>
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
  </property>
  <!-- Enable log aggregation -->
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <!-- URL of the log server (JobHistoryServer web UI) -->
  <property>
    <name>yarn.log.server.url</name>
    <value>http://linux02:19888/jobhistory/logs</value>
  </property>
  <!-- Keep aggregated logs for 7 days (seconds) -->
  <property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
  </property>
</configuration>
<configuration>
<!-- Run MapReduce jobs on YARN -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- JobHistoryServer RPC address -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>linux02:10020</value>
</property>
<!-- JobHistoryServer web UI address -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>linux02:19888</value>
</property>
<!-- Point the MR ApplicationMaster, map, and reduce container
     environments at the Hadoop installation directory -->
<property>
<name>yarn.app.mapreduce.am.env</name>
<value>HADOOP_MAPRED_HOME=/opt/hadoop-3.1.3</value>
</property>
<property>
<name>mapreduce.map.env</name>
<value>HADOOP_MAPRED_HOME=/opt/hadoop-3.1.3</value>
</property>
<property>
<name>mapreduce.reduce.env</name>
<value>HADOOP_MAPRED_HOME=/opt/hadoop-3.1.3</value>
</property>
</configuration>
linux02
linux03
linux04
初始化
hdfs namenode -format
启动脚本
#!/bin/bash
# myhadoop — start or stop the whole Hadoop cluster from one control node.
# Usage: myhadoop {start|stop}
# Per the deployment plan: HDFS + history server on linux02, YARN on linux03.
if [ $# -lt 1 ]
then
 echo "No Args Input..."
 # FIX: error path should report failure (original exited 0)
 exit 1
fi
# FIX: "$1" quoted so an argument containing spaces cannot break the case
case "$1" in
"start")
 echo " =================== 启动 hadoop集群 ==================="
 echo " --------------- 启动 hdfs ---------------"
 ssh linux02 "/opt/hadoop-3.1.3/sbin/start-dfs.sh"
 echo " --------------- 启动 yarn ---------------"
 ssh linux03 "/opt/hadoop-3.1.3/sbin/start-yarn.sh"
 echo " --------------- 启动 historyserver ---------------"
 ssh linux02 "/opt/hadoop-3.1.3/bin/mapred --daemon start historyserver"
;;
"stop")
 echo " =================== 关闭 hadoop集群 ==================="
 echo " --------------- 关闭 historyserver ---------------"
 ssh linux02 "/opt/hadoop-3.1.3/bin/mapred --daemon stop historyserver"
 echo " --------------- 关闭 yarn ---------------"
 ssh linux03 "/opt/hadoop-3.1.3/sbin/stop-yarn.sh"
 echo " --------------- 关闭 hdfs ---------------"
 ssh linux02 "/opt/hadoop-3.1.3/sbin/stop-dfs.sh"
;;
*)
 echo "Input Args Error..."
 # FIX: unknown subcommand is an error
 exit 1
;;
esac
查看所有进程脚本
#!/bin/bash
# jpsall — list the Java processes (jps) on every cluster node.
nodes=(linux02 linux03 linux04)
for node in "${nodes[@]}"; do
  printf '=============== %s ===============\n' "$node"
  ssh "$node" jps
done
格式化Hadoop
先停进程
再删除 data 和 logs 目录
再格式化 hdfs namenode -format
再启动集群
test hadoop
# Create a directory in HDFS
hadoop fs -mkdir /myinput
# Upload a local file into it
hadoop fs -put wcinput/word.txt /myinput
# Usage: hadoop fs -put <local path> <HDFS path>
# (FIX: the original wrote this placeholder as a live command, which
# could never run)
hadoop fs -put jdk-8u301-linux-x64.rpm /myinput
# Run the bundled wordcount example over /myinput, writing to /output1
hadoop jar /opt/hadoop-3.1.3/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.3.jar wordcount /myinput /output1
spark
local env
// Word count in the spark-shell: split each line on spaces, pair every
// word with 1, sum the counts per word, and collect to the driver.
sc.textFile("data/word.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect
# Submit the bundled SparkPi example in local mode using all cores;
# the trailing "10" is the number of slices (partitions) for the job.
bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master local[*] \
./examples/jars/spark-examples_2.12-3.0.0.jar \
10
yarn env
# Submit SparkPi to YARN in client mode (the driver runs on this host);
# "10" is the slice count.
./bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn \
--deploy-mode client \
./examples/jars/spark-examples_2.12-3.0.0.jar 10
# Start the Spark cluster
sbin/start-all.sh
# Start the Spark history server
sbin/start-history-server.sh
# Submit SparkPi to YARN, loading job settings from an external
# properties file instead of command-line --conf flags
$SPARK_HOME/bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn \
--properties-file /home/job1/conf.conf \
$SPARK_HOME/examples/jars/spark-examples_2.12-3.0.0.jar 10
