Analysis

Applicable when one of the tables being joined is small.

The small table can be distributed to every map node, so each map node can join the large-table records it reads locally and emit the final result. This greatly increases the parallelism of the join.

DistributedCacheDriver: caching the file

  1. Cache the data file to speed up the join (a sketch of the cache-file setup, including how the local file name can be controlled, follows this list)
  2. job.addCacheFile(new URI("file://e:/cache/od.txt"));
  3. The map-side join logic needs no reduce phase, so set the number of reduce tasks to 0
  4. job.setNumReduceTasks(0);
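
A minimal driver-side sketch of these two calls, assuming an illustrative HDFS path and class name; the `#pd.txt` fragment on the URI is an optional Hadoop feature that sets the symlinked name the task sees in its working directory:

```java
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// Sketch only: the path and the "#pd.txt" link name are illustrative, not from the original job.
public class CacheSetupSketch {
    public static Job newJoinJob() throws Exception {
        Job job = Job.getInstance(new Configuration());
        // Ship the small table to every task; the fragment after '#' sets the local symlink name.
        job.addCacheFile(new URI("hdfs:///cache/pd.txt#pd.txt"));
        // Map-side join: the map output is final, so no reduce tasks are needed.
        job.setNumReduceTasks(0);
        return job;
    }
}
```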

Reading the cached file data

In the setup method:

  1. Get the cached file (an alternative that resolves the file through the task context is sketched after this list)
  2. Read the cached file line by line
  3. Split each line
  4. Cache the data into a collection
  5. Close the stream
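
If you would rather not hard-code the local file name in setup, the URIs registered with addCacheFile can be read back from the task context. A minimal sketch, assuming a single cached key/value file split on spaces (the class name is illustrative; the join logic in map() is omitted because it is identical to the mapper shown below):

```java
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CacheAwareMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    private final Map<String, String> pdMap = new HashMap<>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Look up the URIs registered with job.addCacheFile(...)
        URI[] cacheFiles = context.getCacheFiles();
        if (cacheFiles == null || cacheFiles.length == 0) {
            return; // nothing was cached; leave the map empty
        }
        // The cached file is symlinked into the task's working directory under its file name
        String localName = new Path(cacheFiles[0].getPath()).getName();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(localName), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split(" ");
                if (fields.length >= 2) {
                    pdMap.put(fields[0], fields[1]);
                }
            }
        }
    }

    // map() omitted; the join logic is the same as in DistributedCacheMapper below.
}
```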

In the map method:

  1. Get a line
  2. Split it
  3. Get the product id
  4. Get the product name from the cached map (a defensive variant that handles missing ids is sketched after this list)
  5. Concatenate
  6. Write out
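
One detail these steps gloss over: pdMap.get returns null when an order record references a product id that is missing from the cached table, so the concatenation would emit the literal string "null". A minimal defensive variant of the map logic (the class name and the "NULL" placeholder are illustrative, not part of the original):

```java
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical variant of the mapper shown below; only the missing-id handling differs.
public class SafeJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    Map<String, String> pdMap = new HashMap<>(); // filled in setup(), exactly as below
    Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split(" ");
        if (fields.length < 2) {
            return; // skip malformed order records
        }
        // Fall back to a placeholder when the product id is not in the cached table.
        String pdName = pdMap.getOrDefault(fields[1], "NULL");
        k.set(line + "\t" + pdName);
        context.write(k, NullWritable.get());
    }
}
```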

Code

This continues the requirement from the reduce-side join above.

Mapper class

```java
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // product id -> product name, loaded from the cached pd.txt
    Map<String, String> pdMap = new HashMap<>();
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the cached file (localized into the task's working directory under its file name)
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream("pd.txt"), "UTF-8"));
        String line;
        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            // Split the line
            String[] fields = line.split(" ");
            // Cache the data into the map
            pdMap.put(fields[0], fields[1]);
        }
        // Close the stream
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Get a line
        String line = value.toString();
        // Split it
        String[] fields = line.split(" ");
        // Get the product id
        String pId = fields[1];
        // Get the product name
        String pdName = pdMap.get(pId);
        // Concatenate
        k.set(line + "\t" + pdName);
        // Write out
        context.write(k, NullWritable.get());
    }
}
```

Driver class

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class TableDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        // Job setup
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Set the jar by the driver class
        job.setJarByClass(TableDriver.class);

        // Set the mapper
        job.setMapperClass(DistributedCacheMapper.class);

        // Set the final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Set the input file and the output path
        FileInputFormat.setInputPaths(job, new Path("/Users/jdxia/Desktop/website/data/input/order.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/Users/jdxia/Desktop/website/data/output"));

        // Load the cached data
        job.addCacheFile(new URI("file:///Users/jdxia/Desktop/website/data/input/pd.txt"));

        // No reduce phase is needed
        job.setNumReduceTasks(0);

        job.waitForCompletion(true);
    }
}
```
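
Because the job runs with zero reduce tasks, the joined records are written straight to map output files (part-m-00000 and so on) under the output path. A common optional refinement, not part of the original driver, is to propagate the job status as the process exit code; a minimal skeleton (the class name is illustrative, and the job setup is elided because it matches TableDriver above):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical skeleton showing only the exit-code pattern; the real configuration
// (mapper, paths, cache file, zero reduce tasks) is the same as in TableDriver above.
public class TableDriverWithExitCode {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        // ... configure the job exactly as in TableDriver ...
        // Exit with 0 when the job succeeds and 1 when it fails.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```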