Analysis

Applicable when one of the tables being joined is small.

The small table can be distributed to every map node, so each map node can join the large-table records it reads locally and emit the final result. This greatly increases the parallelism of the join.

DistributedCacheDriver: caching the file

  1. Cache the data file to speed up the join (a sketch of the cache-file setup, including how the local file name can be controlled, follows this list)
  2. job.addCacheFile(new URI("file://e:/cache/od.txt"));
  3. The map-side join logic needs no reduce phase, so set the number of reduce tasks to 0
  4. job.setNumReduceTasks(0);
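
A minimal driver-side sketch of these two calls, assuming an illustrative HDFS path and class name; the `#pd.txt` fragment on the URI is an optional Hadoop feature that sets the symlinked name the task sees in its working directory:

```java
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// Sketch only: the path and the "#pd.txt" link name are illustrative, not from the original job.
public class CacheSetupSketch {
    public static Job newJoinJob() throws Exception {
        Job job = Job.getInstance(new Configuration());
        // Ship the small table to every task; the fragment after '#' sets the local symlink name.
        job.addCacheFile(new URI("hdfs:///cache/pd.txt#pd.txt"));
        // Map-side join: the map output is final, so no reduce tasks are needed.
        job.setNumReduceTasks(0);
        return job;
    }
}
```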

Reading the cached file data

In the setup method:

  1. Get the cached file (an alternative that resolves the file through the task context is sketched after this list)
  2. Read the cached file line by line
  3. Split each line
  4. Cache the data into a collection
  5. Close the stream
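
If you would rather not hard-code the local file name in setup, the URIs registered with addCacheFile can be read back from the task context. A minimal sketch, assuming a single cached key/value file split on spaces (the class name is illustrative; the join logic in map() is omitted because it is identical to the mapper shown below):

```java
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CacheAwareMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    private final Map<String, String> pdMap = new HashMap<>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Look up the URIs registered with job.addCacheFile(...)
        URI[] cacheFiles = context.getCacheFiles();
        if (cacheFiles == null || cacheFiles.length == 0) {
            return; // nothing was cached; leave the map empty
        }
        // The cached file is symlinked into the task's working directory under its file name
        String localName = new Path(cacheFiles[0].getPath()).getName();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(localName), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split(" ");
                if (fields.length >= 2) {
                    pdMap.put(fields[0], fields[1]);
                }
            }
        }
    }

    // map() omitted; the join logic is the same as in DistributedCacheMapper below.
}
```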

In the map method:

  1. Get a line
  2. Split it
  3. Get the product id
  4. Get the product name from the cached map (a defensive variant that handles missing ids is sketched after this list)
  5. Concatenate
  6. Write out
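
One detail these steps gloss over: pdMap.get returns null when an order record references a product id that is missing from the cached table, so the concatenation would emit the literal string "null". A minimal defensive variant of the map logic (the class name and the "NULL" placeholder are illustrative, not part of the original):

```java
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical variant of the mapper shown below; only the missing-id handling differs.
public class SafeJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    Map<String, String> pdMap = new HashMap<>(); // filled in setup(), exactly as below
    Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split(" ");
        if (fields.length < 2) {
            return; // skip malformed order records
        }
        // Fall back to a placeholder when the product id is not in the cached table.
        String pdName = pdMap.getOrDefault(fields[1], "NULL");
        k.set(line + "\t" + pdName);
        context.write(k, NullWritable.get());
    }
}
```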

Code

This continues the requirement from the reduce-side join above.

Mapper class

```java
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // product id -> product name, loaded from the cached pd.txt
    Map<String, String> pdMap = new HashMap<>();
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the cached file (localized into the task's working directory under its file name)
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream("pd.txt"), "UTF-8"));
        String line;
        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            // Split the line
            String[] fields = line.split(" ");
            // Cache the data into the map
            pdMap.put(fields[0], fields[1]);
        }
        // Close the stream
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Get a line
        String line = value.toString();
        // Split it
        String[] fields = line.split(" ");
        // Get the product id
        String pId = fields[1];
        // Get the product name
        String pdName = pdMap.get(pId);
        // Concatenate
        k.set(line + "\t" + pdName);
        // Write out
        context.write(k, NullWritable.get());
    }
}
```

Driver class

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class TableDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        // Job setup
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Set the jar by the driver class
        job.setJarByClass(TableDriver.class);

        // Set the mapper
        job.setMapperClass(DistributedCacheMapper.class);

        // Set the final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Set the input file and the output path
        FileInputFormat.setInputPaths(job, new Path("/Users/jdxia/Desktop/website/data/input/order.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/Users/jdxia/Desktop/website/data/output"));

        // Load the cached data
        job.addCacheFile(new URI("file:///Users/jdxia/Desktop/website/data/input/pd.txt"));

        // No reduce phase is needed
        job.setNumReduceTasks(0);

        job.waitForCompletion(true);
    }
}
```
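
Because the job runs with zero reduce tasks, the joined records are written straight to map output files (part-m-00000 and so on) under the output path. A common optional refinement, not part of the original driver, is to propagate the job status as the process exit code; a minimal skeleton (the class name is illustrative, and the job setup is elided because it matches TableDriver above):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical skeleton showing only the exit-code pattern; the real configuration
// (mapper, paths, cache file, zero reduce tasks) is the same as in TableDriver above.
public class TableDriverWithExitCode {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        // ... configure the job exactly as in TableDriver ...
        // Exit with 0 when the job succeeds and 1 when it fails.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```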