[01].项目源码

  • 新建一个java项目

18.Wordcount程序测试 - 图1

01.WordCountReduce

  1. import java.io.IOException;
  2. import org.apache.hadoop.io.IntWritable;
  3. import org.apache.hadoop.io.Text;
  4. import org.apache.hadoop.mapreduce.Reducer;
  5. /*
  6. * KEYIN:reduse输入key(是一个单词Text)
  7. * VALUEIN:reduse输入value(是单词的次数IntWritable)
  8. * KEYOUT:reduse输出的key(一个单词Text)
  9. * VALUEOUT:reduse输出的value(单词次数IntWritable)
  10. */
  11. public class WordCountReduce extends Reducer<Text, IntWritable, Text,IntWritable> {
  12. private IntWritable outValue = new IntWritable();
  13. /*
  14. * {key,{1,1,1,1,1}}
  15. * 这里的数据已经被suffer过了!
  16. * {thoughts,{1,1}}
  17. */
  18. @Override
  19. protected void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
  20. int sum = 0 ;
  21. for(IntWritable value:values) {
  22. sum+=value.get();
  23. }
  24. outValue.set(sum);
  25. context.write(key, outValue);
  26. }
  27. }

02.WordCountMapper

  1. import java.io.IOException;
  2. import org.apache.hadoop.io.IntWritable;
  3. import org.apache.hadoop.io.LongWritable;
  4. import org.apache.hadoop.io.Text;
  5. import org.apache.hadoop.mapreduce.Mapper;
  6. /*
  7. * KEYIN:默认已经拿到的key(行首的偏移量:LongWritable(相当于java的long类型))
  8. *
  9. * VALUEIN:默认分片好的内容(Text类型(相当于string))
  10. *
  11. * KEYOUT:输出的key(Text类型)
  12. *
  13. * VALUEOUT:输出的value(都是1,记作{"word",1},数字类型IntWritable(int))
  14. */
  15. public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
  16. private Text outKey = new Text();
  17. private IntWritable intValue = new IntWritable();
  18. @Override
  19. protected void map(LongWritable key, Text value,Context context)
  20. throws IOException, InterruptedException {
  21. String wordArr[] = value.toString().split("[^a-zA-Z]+");
  22. for(String word:wordArr) {
  23. outKey.set(word);
  24. intValue.set(1);
  25. context.write(outKey, intValue);
  26. }
  27. }
  28. }

03.WordCountDriver

  1. import java.io.IOException;
  2. import org.apache.hadoop.conf.Configuration;
  3. import org.apache.hadoop.fs.Path;
  4. import org.apache.hadoop.io.IntWritable;
  5. import org.apache.hadoop.io.Text;
  6. import org.apache.hadoop.mapreduce.Job;
  7. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  8. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  9. public class WordCountDriver {
  10. public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
  11. // 1.创建job对象
  12. Configuration conf = new Configuration();
  13. Job job = Job.getInstance(conf);
  14. // 2.设置jar路径
  15. job.setJarByClass(WordCountDriver.class);
  16. // 3.关联map与red
  17. job.setMapperClass(WordCountMapper.class);
  18. job.setReducerClass(WordCountReduce.class);
  19. // 4.设置map输出的键值对类型
  20. job.setMapOutputKeyClass(Text.class);
  21. job.setMapOutputValueClass(IntWritable.class);
  22. // 5.设置最终数据输出键值对类型
  23. job.setOutputKeyClass(Text.class);
  24. job.setOutputValueClass(IntWritable.class);
  25. // 6.设置输入路径(FileInputFormat)和输出路径(FileOutputFormat)
  26. FileInputFormat.setInputPaths(job, new Path(args[0]));
  27. FileOutputFormat.setOutputPath(job, new Path(args[1]));
  28. // 7.提交job
  29. boolean result = job.waitForCompletion(true);// true:打印运行信息
  30. System.exit(result ? 0 : 1);// 1:非正常退出
  31. }
  32. }
  • 本地测试的版本

    1. // 6.设置输入路径(FileInputFormat)和输出路径(FileOutputFormat)
    2. FileInputFormat.setInputPaths(job, new Path("D:\\360MoveData\\Users\\AIGameJXB\\Desktop\\Hadoop\\JavaProjectBigData\\src\\cn\\aigamejxb\\hadoop\\mapreduce\\input"));
    3. //输出路径必须是不存在的路径,否则如果它发现存在该目录,那么就直接停止
    4. FileOutputFormat.setOutputPath(job, new Path("D:\\360MoveData\\Users\\AIGameJXB\\Desktop\\Hadoop\\JavaProjectBigData\\src\\cn\\aigamejxb\\hadoop\\mapreduce\\output"));
    5. // 7.提交job
    6. boolean result = job.waitForCompletion(true);//true:打印运行信息
    7. System.exit(result?0:1);//1:非正常退出

    04.log4j.properties

    # priority: debug < info < warn < error
    # Root logger runs at DEBUG so every event reaches the appenders below;
    # each file appender then filters by severity via its own Threshold.
    # (The removed "log4j.logger.info=info" style lines defined LOGGERS literally
    # named "info"/"debug"/"warn"/"error" — they never affected levels and were dead config.)
    log4j.rootLogger=DEBUG,stdout,info,debug,warn,error
    # console
    log4j.appender.stdout=org.apache.log4j.ConsoleAppender
    log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
    log4j.appender.stdout.layout.ConversionPattern= [%d{yyyy-MM-dd HH:mm:ss a}]:%p %l%m%n
    # info log (INFO and above)
    log4j.appender.info=org.apache.log4j.DailyRollingFileAppender
    log4j.appender.info.DatePattern='_'yyyy-MM-dd'.log'
    log4j.appender.info.File=./log/info.log
    log4j.appender.info.Append=true
    log4j.appender.info.Threshold=INFO
    log4j.appender.info.layout=org.apache.log4j.PatternLayout
    log4j.appender.info.layout.ConversionPattern=%m%n
    # debug log (DEBUG and above)
    log4j.appender.debug=org.apache.log4j.DailyRollingFileAppender
    log4j.appender.debug.DatePattern='_'yyyy-MM-dd'.log'
    log4j.appender.debug.File=./log/debug.log
    log4j.appender.debug.Append=true
    log4j.appender.debug.Threshold=DEBUG
    log4j.appender.debug.layout=org.apache.log4j.PatternLayout
    log4j.appender.debug.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss a} [Thread: %t][ Class:%c >> Method: %l ]%n%p:%m%n
    # warn log (WARN and above)
    log4j.appender.warn=org.apache.log4j.DailyRollingFileAppender
    log4j.appender.warn.DatePattern='_'yyyy-MM-dd'.log'
    log4j.appender.warn.File=./log/warn.log
    log4j.appender.warn.Append=true
    log4j.appender.warn.Threshold=WARN
    log4j.appender.warn.layout=org.apache.log4j.PatternLayout
    log4j.appender.warn.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss a} [Thread: %t][ Class:%c >> Method: %l ]%n%p:%m%n
    # error log (ERROR and above)
    log4j.appender.error=org.apache.log4j.DailyRollingFileAppender
    log4j.appender.error.DatePattern='_'yyyy-MM-dd'.log'
    log4j.appender.error.File=./log/error.log
    log4j.appender.error.Append=true
    log4j.appender.error.Threshold=ERROR
    log4j.appender.error.layout=org.apache.log4j.PatternLayout
    log4j.appender.error.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss a} [Thread: %t][ Class:%c >> Method: %l ]%n%p:%m%n
    

    05.words.txt文本文件

  • 这是我在网上搜的小学生英语作文。。。

    It was a fine day today. We went to visit the Yakult Milk Factory.
    At nine o'clock we all meet at the playground. We went there by school bus. When we got there, we watched a short cartoon.
    It is the introduction of the Yakult milk. And we also drank a bottle of Yakult milk too. It is so delicious. After that they showed us around the workshops. It is so clean and quiet. Most of them are machines.
    There are a few workers in this factory. It is the first time for me to see such a mechanical factory.
    At noon, we went to the MacDonald's for lunch. Today was a happy day for me.
    

    [02].环境准备

  • (主节点)安装一个小工具

    yum -y install lrzsz
    #方便使用拖拽方式上传文件(一会上传words.txt)
    
  • 启动hadoop

    [root@master ~]# cd /opt/hadoop-2.6.0/sbin/
    [root@master sbin]# ./start-all.sh
    
  • 创建一个目录

    hadoop fs -mkdir /wordcount
    

    18.Wordcount程序测试 - 图2

  • 拷贝单词文件

18.Wordcount程序测试 - 图3

  • 把文件拖进去

18.Wordcount程序测试 - 图4

  • 上传到hadoop

    hadoop fs -copyFromLocal ./words.txt /wordcount
    

    18.Wordcount程序测试 - 图5

  • 访问后台,证明上传成功

18.Wordcount程序测试 - 图6

[03].打jar包

  • 右键项目

18.Wordcount程序测试 - 图7
18.Wordcount程序测试 - 图8
18.Wordcount程序测试 - 图9
18.Wordcount程序测试 - 图10

  • 直接下一步(默认)

18.Wordcount程序测试 - 图11

  • 打包完后上传jar包

18.Wordcount程序测试 - 图12

[04].测试运行

  • 执行命令:

    hadoop jar hdp.jar /wordcount /output
    

    18.Wordcount程序测试 - 图13

    [05].查看运行结果

    hadoop fs -cat /output/part-r-00000
    

    18.Wordcount程序测试 - 图14

  • 文件内容

18.Wordcount程序测试 - 图15

01.本地测试

18.Wordcount程序测试 - 图16
18.Wordcount程序测试 - 图17
本地测试成功。

02.Hadoop测试

18.Wordcount程序测试 - 图18

  • 查看output中的文件

18.Wordcount程序测试 - 图19

[06].额外测试

01.以runnable的方式打的jar包在linux中运行的方式

java -jar r.jar ./input ./output
#r.jar是我用runnable的方式打的jar包,直接用jvm来运行
  • ./input是虚拟机中linux的本地路径,里面有个words.txt
  • ./output也是指向虚拟机的linux的本地路径

    02.探索./output与/output的区别?

    实际上执行第二遍时,就会报错,会提示你output文件已经存在,我把./output与/output两种路径分别跑了两遍,打开后台,或者看报错提示都能看到文件路径:
    18.Wordcount程序测试 - 图20
    不妨打开前台看看:
    18.Wordcount程序测试 - 图21
    所以得出结论:

  • ./output路径是/user/root/output

  • /output路径是/output