1. Hadoop HDFS
HDFS is Hadoop's distributed file system.
1.1 Common Commands
```
hadoop fs -mkdir         # create an HDFS directory
hadoop fs -ls            # list an HDFS directory
hadoop fs -copyFromLocal # copy a local file to HDFS
hadoop fs -put           # copy a file to HDFS
hadoop fs -cat           # print an HDFS file's contents
hadoop fs -copyToLocal   # copy an HDFS file to the local filesystem
hadoop fs -get           # copy an HDFS file to the local filesystem
hadoop fs -cp            # copy within HDFS
hadoop fs -rm            # delete a file or directory
```
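Full usage for any of these subcommands can be printed with the built-in help, for example:

```
# show usage and options for the ls subcommand
hadoop fs -help ls
```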
1.2 Directories
```
# create the user directory
hadoop fs -mkdir /user
# create a test subdirectory under user
hadoop fs -mkdir /user/test
# delete a directory
hadoop fs -rm -r /user/test
# list all entries directly under the HDFS root /
hadoop fs -ls /
# recursively list everything under /
hadoop fs -ls -R /
```
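`-mkdir` also accepts `-p`, which creates any missing parent directories in one step (the same flag is used when uploading in section 2.2). A small example using the paths above:

```
# create /user/test/data together with any missing parents
hadoop fs -mkdir -p /user/test/data
```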
1.3 Files
```
# copy a local file to an HDFS directory; the -f flag forces an overwrite
hadoop fs -copyFromLocal test.txt /user/test
hadoop fs -put test.txt /user/test/test.txt
# print the file's contents
hadoop fs -cat /user/test/test.txt
hadoop fs -cat /user/test/test.txt | more
# copy an HDFS file to the local filesystem
hadoop fs -copyToLocal /user/test/test.txt
# copy an HDFS file to the local filesystem
hadoop fs -get /user/test/test.txt
# copy within HDFS
hadoop fs -cp /user/test/test.txt /user/test/temp.txt
# delete the file
hadoop fs -rm /user/test/test.txt
```
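A quick sketch of the `-f` flag mentioned in the comment above: without it, copying onto an existing destination fails; with it, the destination is overwritten.

```
# overwrite /user/test/test.txt if it already exists
hadoop fs -copyFromLocal -f test.txt /user/test
hadoop fs -put -f test.txt /user/test/test.txt
```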
2. Hadoop MapReduce
MapReduce lets a large number of servers process data in parallel: the Map phase splits the input into pieces, and the Reduce phase merges the intermediate results.
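As a rough illustration of the model (plain shell, not Hadoop, assuming a local file `input.txt`), word count can be expressed as a map step that emits one word per line, a shuffle that groups identical words, and a reduce step that counts each group:

```
# map: split the text into one word per line
tr -s ' ' '\n' < input.txt |
# shuffle: sort so identical words become adjacent
sort |
# reduce: count each run of identical words
uniq -c
```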
2.1 Create WordCount.java
- Create the working directory

```
mkdir -p ~/wordcount/input
cd ~/wordcount
```

- Open the official MapReduce Tutorial documentation and copy the WordCount.java source:
```
sudo vim WordCount.java
```

```
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
```
- Compile WordCount.java

```
# compile
javac -classpath `${HADOOP_HOME}/bin/hadoop classpath` WordCount.java
# package
jar cf wordcount.jar WordCount*.class
# list the generated files
ll
```
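Before running the job, it can help to confirm the compiled classes actually made it into the jar:

```
# list the contents of the jar
jar tf wordcount.jar
```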
2.2 Testing
- Use the LICENSE.txt shipped with Hadoop as test input
```
cp /usr/local/hadoop/LICENSE.txt ~/wordcount/input
```
- Upload LICENSE.txt to HDFS

```
hadoop fs -mkdir -p /wordcount/input
cd ~/wordcount/input
hadoop fs -copyFromLocal LICENSE.txt /wordcount/input
```
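A quick listing verifies the upload:

```
hadoop fs -ls /wordcount/input
```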
- Run the WordCount job

```
cd ~/wordcount
hadoop jar wordcount.jar WordCount /wordcount/input/LICENSE.txt /wordcount/output/
```
If the job fails with an error, check whether `<HADOOP_HOME>/etc/hadoop/mapred-site.xml` contains the following configuration:

```
<property>
  <name>yarn.app.mapreduce.am.env</name>
  <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
<property>
  <name>mapreduce.map.env</name>
  <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
<property>
  <name>mapreduce.reduce.env</name>
  <value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
```
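The configuration above relies on `HADOOP_HOME` being set, which is easy to confirm; on this setup it should presumably print `/usr/local/hadoop`, the path used earlier in this section:

```
# confirm HADOOP_HOME resolves on this node; the XML above depends on it
echo ${HADOOP_HOME}
```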
- Check the job output
- List the HDFS output directory; `_SUCCESS` indicates the job completed successfully, and `part-r-00000` holds the generated results:

```
hadoop fs -ls /wordcount/output
```
```
-rw-r--r--   3 root supergroup       0 2022-02-07 20:22 /wordcount/output/_SUCCESS
-rw-r--r--   3 root supergroup    9894 2022-02-07 20:22 /wordcount/output/part-r-00000
```
- View the result file in HDFS

```
hadoop fs -cat /wordcount/output/part-r-00000 | more
```
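Since each output line is a word followed by its count, standard shell tools can rank the results; for example, to show the twenty most frequent words:

```
# sort by the count column (2nd field), descending, and keep the top 20
hadoop fs -cat /wordcount/output/part-r-00000 | sort -k2 -nr | head -20
```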
- Note:
- To run WordCount again, delete the HDFS output directory first; otherwise the job fails because the output directory already exists (see the command below).
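For example:

```
# remove the previous output so the job can run again
hadoop fs -rm -r /wordcount/output
```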
