1. Hadoop HDFS
Hadoop HDFS is Hadoop's distributed file system.
1.1 Common Commands
hadoop fs -mkdir # Create an HDFS directory
hadoop fs -ls # List an HDFS directory
hadoop fs -copyFromLocal # Copy a local file to HDFS
hadoop fs -put # Copy a file to HDFS
hadoop fs -cat # Display the contents of an HDFS file
hadoop fs -copyToLocal # Copy an HDFS file to the local file system
hadoop fs -get # Copy an HDFS file to the local file system
hadoop fs -cp # Copy files within HDFS
hadoop fs -rm # Delete files or directories
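Each command accepts additional flags; the built-in help prints the full usage for any of them, for example:
# Show detailed usage for a single command
hadoop fs -help ls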
1.2 Directories
# Create the user directory
hadoop fs -mkdir /user
# Create the test subdirectory under user
hadoop fs -mkdir /user/test
# Delete a directory recursively
hadoop fs -rm -r /user/test
# List everything directly under the HDFS root /
hadoop fs -ls /
# Recursively list all contents under / (note the uppercase -R; lowercase -r reverses the sort order)
hadoop fs -ls -R /
1.3 Files
# Copy a local file to an HDFS directory; add -f to overwrite an existing file
hadoop fs -copyFromLocal test.txt /user/test
hadoop fs -put test.txt /user/test/test.txt
# Display file contents
hadoop fs -cat /user/test/test.txt
hadoop fs -cat /user/test/test.txt | more
# Copy an HDFS file to the local file system
hadoop fs -copyToLocal /user/test/test.txt
hadoop fs -get /user/test/test.txt
# Copy a file within HDFS
hadoop fs -cp /user/test/test.txt /user/test/temp.txt
# Delete a file
hadoop fs -rm /user/test/test.txt
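The same operations are also available programmatically through Hadoop's FileSystem Java API. Below is a minimal sketch; the class name HdfsOps and the file paths are illustrative, and it assumes the Hadoop client jars are on the classpath and core-site.xml points at the cluster:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsOps {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();  // reads core-site.xml from the classpath
    FileSystem fs = FileSystem.get(conf);

    fs.mkdirs(new Path("/user/test"));                      // hadoop fs -mkdir
    fs.copyFromLocalFile(new Path("test.txt"),
                         new Path("/user/test/test.txt"));  // hadoop fs -put
    fs.copyToLocalFile(new Path("/user/test/test.txt"),
                       new Path("test-copy.txt"));          // hadoop fs -get
    fs.delete(new Path("/user/test/test.txt"), false);      // hadoop fs -rm (false = non-recursive)

    fs.close();
  }
}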
2. Hadoop MapReduce
MapReduce allows a large number of servers to process data in parallel: the Map phase splits the input and emits key/value pairs, and the Reduce phase merges and aggregates them, as the example below illustrates.
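For example, counting the words in one hypothetical input line:

input:          hello world hello
map output:     (hello, 1) (world, 1) (hello, 1)
shuffle/sort:   (hello, [1, 1]) (world, [1])
reduce output:  (hello, 2) (world, 1)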
2.1 Create WordCount.java
- Create the working directory
mkdir -p ~/wordcount/input
cd ~/wordcount
- Open the official MapReduce Tutorial documentation and copy the WordCount.java source
vim WordCount.java
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {

    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
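Note that the job registers IntSumReducer as both the combiner and the reducer: because summing counts is associative and commutative, partial sums can be pre-aggregated on the map side, which cuts down the amount of data shuffled to the reducers.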
- Compile WordCount.java
# Compile; the backticks substitute the output of `hadoop classpath`, which lists the Hadoop client jars
javac -classpath `${HADOOP_HOME}/bin/hadoop classpath` WordCount.java
# Package the class files into a jar
jar cf wordcount.jar WordCount*.class
# Verify the generated files
ls -l
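To double-check what went into the jar, its contents can also be listed:
# List the classes packaged in the jar
jar tf wordcount.jar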
2.2 Test
- Use the LICENSE.txt that ships with Hadoop as test input
cp /usr/local/hadoop/LICENSE.txt ~/wordcount/input
- Upload LICENSE.txt to HDFS
hadoop fs -mkdir -p /wordcount/input
cd ~/wordcount/input
hadoop fs -copyFromLocal LICENSE.txt /wordcount/input
- Run the WordCount job
cd ~/wordcount
hadoop jar wordcount.jar WordCount /wordcount/input/LICENSE.txt /wordcount/output/
If the job fails with an error, check whether <HADOOP_HOME>/etc/hadoop/mapred-site.xml contains the following configuration:
<property>
  <name>yarn.app.mapreduce.am.env</name>
  <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
<property>
  <name>mapreduce.map.env</name>
  <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
<property>
  <name>mapreduce.reduce.env</name>
  <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
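If the error persists, a common cause is that $HADOOP_HOME is not visible in the NodeManager's environment, so the variable never expands; in that case, substituting the absolute installation path used earlier in this guide should work, e.g.:
<value>HADOOP_MAPRED_HOME=/usr/local/hadoop</value>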
- View the results
- List the HDFS output directory
hadoop fs -ls /wordcount/output
An empty _SUCCESS file indicates the job completed successfully; part-r-00000 contains the generated results:
-rw-r--r-- 3 root supergroup 0 2022-02-07 20:22 /wordcount/output/_SUCCESS
-rw-r--r-- 3 root supergroup 9894 2022-02-07 20:22 /wordcount/output/part-r-00000
- View the result file on HDFS
hadoop fs -cat /wordcount/output/part-r-00000 | more
- Notes:
- Before running WordCount again, delete the output directory first; otherwise the job fails because the output path already exists (see the command below).
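For example:
# Remove the previous output before re-running the job
hadoop fs -rm -r /wordcount/output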