wordCount &简易知识点
总体概述
MapReduce 总共分为map和reduce 一个分一个合 本次主要记录代码的含义和书写
我是写在一个文件夹里面进行的 并没有分成三个文件夹 接下来我会粘贴三部分得代码 和总体代码
总体三层
map
public static class MapperDemo extends Mapper<LongWritable,Text, Text, IntWritable>{
/**
* 声明MapperDemo类 继承 org.apache.hadoop.mapreduce.Mapper; 必须继承这个类
* 继承这个类需要输入泛型 <KEYIN, VALUEIN, KEYOUT, VALUEOUT>
* 这个案列分别是
* KEYIN 编号 int or long 注意java和mapreduce有区别 不能直接给int和 Long
应该是 IntWritable or IntWritable
* VALUEIN 一行数据 string 应该给 Text
* KEYOUT 应该是一个单词 Text
* VALUEOUT 1 IntWritable
*/
// v2
private final IntWritable one = new IntWritable(1);
// 声明Text对象 用于存放key2
Text text = new Text();
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
// 声明一个分词对象 传入参数 string:一行数据(value.toString) dim: "," 按照什么分割
StringTokenizer tokenizer = new StringTokenizer(value.toString(), ",");
// 分割完成就是一个可迭代得 通过while循环 构建k2 v2
while (tokenizer.hasMoreTokens()){
// 分词出来得单词转换成Text
this.text.set(tokenizer.nextToken());
// 传入上下文对象 比如 "hello, 1"
context.write(text, one);
}
}
}
reduce
public static class ReduceDemo extends Reducer<Text, IntWritable, Text, IntWritable>{
/**
* 此功能和map很像 继承得类变成了 org.apache.hadoop.mapreduce.Reducer;
* 也需要实现泛型 <> k2 v2 k3 v3
* 示例 reduce方法首先会把同一个k得value放在一起 比如hello <1,1,1,1,>
最后得出得结果就是 hello, 4
*/
private IntWritable count = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
// 用于计数
int sum = 0;
// 循环迭代 获取value
for (IntWritable value : values) {
// .get() 获取方法
sum += value.get();
count.set(sum);
}
// 放入上下文中
context.write(key, count);
}
}
job提交main
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
System.setProperty("hadoop.home.dir", "H:\\winutils\\winutils-master\\hadoop-2.6.0");
// Configuration
Configuration conf = new Configuration();
// 生成job
Job job = Job.getInstance(conf, "word count");
// 指定打包类
job.setJarByClass(JobMainTwo.class);
// 指定map类
job.setMapperClass(JobMainTwo.MapperDemo.class);
// 输出得key 和 value 数据类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setCombinerClass(JobMainTwo.ReduceDemo.class);
// 指定reduce类
job.setReducerClass(JobMainTwo.ReduceDemo.class);
// 输出得key 和 value 数据类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 读取文件的路径
FileInputFormat.addInputPath(job,new Path("data/java/hello.txt"));
// 输出文件得路径
FileOutputFormat.setOutputPath(job,new Path("data/java/hello-put2"));
// 输出运行结果
System.exit(job.waitForCompletion(true)?0:1);
}
数据结构
hello,world
hello,haha
hello,hadoop
完整代码
package MapReduce_one;
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class JobMainTwo {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
System.setProperty("hadoop.home.dir", "H:\\winutils\\winutils-master\\hadoop-2.6.0");
// Configuration
Configuration conf = new Configuration();
// 生成job
Job job = Job.getInstance(conf, "word count");
// 指定打包类
job.setJarByClass(JobMainTwo.class);
// 指定map类
job.setMapperClass(JobMainTwo.MapperDemo.class);
// 输出得key 和 value 数据类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setCombinerClass(JobMainTwo.ReduceDemo.class);
// 指定reduce类
job.setReducerClass(JobMainTwo.ReduceDemo.class);
// 输出得key 和 value 数据类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 读取文件的路径
FileInputFormat.addInputPath(job,new Path("data/java/hello.txt"));
// 输出文件得路径
FileOutputFormat.setOutputPath(job,new Path("data/java/hello-put2"));
// 输出运行结果
System.exit(job.waitForCompletion(true)?0:1);
}
public static class MapperDemo extends Mapper<LongWritable,Text, Text, IntWritable>{
/**
* 声明MapperDemo类 继承 org.apache.hadoop.mapreduce.Mapper; 必须继承这个类
* 继承这个类需要输入泛型 <KEYIN, VALUEIN, KEYOUT, VALUEOUT>
* 这个案列分别是
* KEYIN 编号 int or long 注意java和mapreduce有区别 不能直接给int和 Long 应该是 IntWritable or IntWritable
* VALUEIN 一行数据 string 应该给 Text
* KEYOUT 应该是一个单词 Text
* VALUEOUT 1 IntWritable
*/
private final IntWritable one = new IntWritable(1);
Text text = new Text();
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
StringTokenizer tokenizer = new StringTokenizer(value.toString(), ",");
while (tokenizer.hasMoreTokens()){
this.text.set(tokenizer.nextToken());
context.write(text, one);
}
}
}
public static class ReduceDemo extends Reducer<Text, IntWritable, Text, IntWritable>{
/**
* 此功能和map很像 继承得类变成了 org.apache.hadoop.mapreduce.Reducer;
* 也需要实现泛型 <> k2 v2 k3 v3
* 示例 reduce方法首先会把同一个k得value放在一起 比如hello <1,1,1,1,> 最后得出得结果就是 hello, 4
*/
private IntWritable count = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
count.set(sum);
}
context.write(key, count);
}
}
}
错误纠正
输出得文件不能重复 如果重复则需要删除 否则报错