package com.learn.wordcount;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
/**
 * Mapper<input key type, input value type, output key type, output value type>
 * Not public: MyMapper, MyReducer and WordCount share a single source file, WordCount.java.
 */
class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split each input line on spaces and emit (word, 1) for every token.
        String[] arr = value.toString().split(" ");
        for (String s : arr) {
            context.write(new Text(s), new IntWritable(1));
        }
    }
}
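Because map() runs once per input line, a common Hadoop micro-optimization is to reuse the Writable objects rather than allocate new ones for every token; this is safe because context.write() serializes the current contents immediately. A minimal sketch of the same mapper with reusable fields (same imports as the file above):

class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reused across calls to avoid per-token allocations.
    private final Text word = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String s : value.toString().split(" ")) {
            word.set(s);
            context.write(word, one);
        }
    }
}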
/**
 * Reducer<input key type (k2), input value type (v2), output key type (k3), output value type (v3)>
 */
class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum the 1s emitted by the mapper (and any partial sums produced by the combiner).
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}
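Step 5 of the driver below reuses MyReducer as the combiner. That is only correct because summation is associative and commutative and because the reducer's output types (Text, IntWritable) match its input types, so partial sums can safely be re-reduced. For example, if one map task emits (a,1), (a,1), (b,1), the combiner collapses this to (a,2), (b,1) on the map side and the reducer then adds the partial counts; a non-associative computation such as an average could not reuse its reducer this way.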
/**
 * Driver class: configures the Job and submits it.
 */
public class WordCount {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String inputFile = "C:\\Users\\pg\\Desktop\\input.txt";
        String inputFile2 = "C:\\Users\\pg\\Desktop\\input2.txt";
        // TextOutputFormat treats this path as a directory; it must not exist before the job runs.
        String outputFile = "C:\\Users\\pg\\Desktop\\out.txt";
        // Create the Job object and wire up the classes for the eight MapReduce steps.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "WordCount");
        job.setJarByClass(WordCount.class);
        // Step 1: set the InputFormat that reads the files and parses them into k1/v1 pairs.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path(inputFile));
        TextInputFormat.addInputPath(job, new Path(inputFile2));
        // Step 2: custom map logic; consumes k1/v1 pairs and emits new k2/v2 pairs.
        job.setMapperClass(MyMapper.class);
        // Step 3: partitioning; records with the same key (k2) go to the same reducer,
        // where equal keys are merged and their values collected into one Iterable.
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(2); // number of reduce tasks
        // Step 4: sorting; k2 is sorted (lexicographically for Text keys).
        // Step 5: combining (the combiner), an optional tuning step.
        job.setCombinerClass(MyReducer.class);
        // Step 6: grouping. The default comparator derived from the key class is used here;
        // setting WritableComparator.class directly would fail at runtime, since it is
        // instantiated without a key class.
        // job.setGroupingComparatorClass(WritableComparator.class);
        // TODO steps 3-6 all have default strategies.
        // Step 7: custom reduce logic; consumes k2/v2 pairs and emits new k3/v3 pairs.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Step 8: save the k3/v3 output.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(outputFile));
        // Step 9: submit the job and wait for it to finish.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
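Step 3 uses the default HashPartitioner, which routes a record to reducer (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks. If custom routing were needed, a partitioner is a small class. The following AlphabetPartitioner is a hypothetical sketch, not part of the original job, splitting words by first letter across the two reduce tasks configured above:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical: words starting with a-m go to reducer 0, everything else to reducer 1.
class AlphabetPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String s = key.toString();
        char first = s.isEmpty() ? 'a' : Character.toLowerCase(s.charAt(0));
        int partition = (first >= 'a' && first <= 'm') ? 0 : 1;
        return partition % numPartitions; // guard in case fewer than two reducers are configured
    }
}

It would be wired in with job.setPartitionerClass(AlphabetPartitioner.class) in place of the HashPartitioner line.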

    pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.learn</groupId>
    <artifactId>HadoopLearn</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.6</version>
            <scope>test</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.logging.log4j/log4j-core -->
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.8</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <!-- This plugin builds both a jar with dependencies (shaded) and one without. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.0</version>
                <configuration>
                    <createDependencyReducedPom>true</createDependencyReducedPom>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <!--<configuration>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    &lt;!&ndash; fully qualified name of the main class &ndash;&gt;
                                    <mainClass>com.learn.wordcount.WordCount</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>-->
                    </execution>
                </executions>
            </plugin>
            <!-- This plugin copies the dependency jars into target/dependency. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-dependency-plugin</artifactId>
                <executions>
                    <execution>
                        <id>copy</id>
                        <phase>package</phase>
                        <goals>
                            <goal>copy-dependencies</goal>
                        </goals>
                        <!--<configuration>-->
                        <!--<outputDirectory>${project.build.directory}/lib-->
                        <!--</outputDirectory>-->
                        <!--</configuration>-->
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
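With the shade plugin bound to the package phase, mvn package produces both the regular jar and a shaded jar containing all dependencies under target/ (the name follows from the artifactId and version above, e.g. HadoopLearn-1.0-SNAPSHOT.jar). On a cluster you would submit it with hadoop jar plus the driver's fully qualified class name, com.learn.wordcount.WordCount; since the driver hard-codes local Windows paths, the job can also be run directly from the IDE in Hadoop's local mode.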

Log configuration (log4j.properties):

    log4j.rootLogger = debug,stdout
    
### Send log output to the console ###
    log4j.appender.stdout = org.apache.log4j.ConsoleAppender
    log4j.appender.stdout.Target = System.out
    log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
    log4j.appender.stdout.layout.ConversionPattern = [%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n
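Note that this is a Log4j 1.x properties file: Hadoop 2.7.x logs through log4j 1.2, which hadoop-common pulls in transitively, so this file is what controls the job's console output. The log4j-core 2.8 dependency declared in the pom is the separate Log4j 2 library and does not read this file.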