- 1.1 DWD 层(业务数据)
- 1.1.1 商品维度表(全量表)
- 1.1.2 优惠券信息表(全量)
- 1.1.3 活动维度表(全量)
- 1.1.4 地区维度表(特殊)
- 1.1.5 时间维度表(特殊)(预留)
- 1.1.6 订单明细事实表(事务型快照事实表)
- 1.1.7 支付事实表(事务型快照事实表)
- 1.1.8 退款事实表(事务型快照事实表)
- 1.1.9 评价事实表(事务型快照事实表)
- 1.1.10 加购事实表(周期型快照事实表,每日快照)
- 1.1.11 收藏事实表(周期型快照事实表,每日快照)
- 1.1.12 优惠券领用事实表(累积型快照事实表)
- 1.1.13 订单事实表(累积型快照事实表)
- 1.1.14 用户维度表(拉链表)
- 1.1.15 DWD 层数据导入脚本
**问题导读:
1.1.1 商品维度表(全量表)
1)建表语句
1. DROP TABLE IF EXISTS `dwd_dim_sku_info`;
2. CREATE EXTERNAL TABLE `dwd_dim_sku_info` (
3. `id` string COMMENT '商品 id',
4. `spu_id` string COMMENT 'spuid',
5. `price` double COMMENT '商品价格',
6. `sku_name` string COMMENT '商品名称',
7. `sku_desc` string COMMENT '商品描述',
8. `weight` double COMMENT '重量',
9. `tm_id` string COMMENT '品牌 id',
10. `tm_name` string COMMENT '品牌名称',
11. `category3_id` string COMMENT '三级分类 id',
12. `category2_id` string COMMENT '二级分类 id',
13. `category1_id` string COMMENT '一级分类 id',
14. `category3_name` string COMMENT '三级分类名称',
15. `category2_name` string COMMENT '二级分类名称',
16. `category1_name` string COMMENT '一级分类名称',
17. `spu_name` string COMMENT 'spu 名称',
18. `create_time` string COMMENT '创建时间'
19. )
20. COMMENT '商品维度表'
21. PARTITIONED BY (`dt` string)
22. stored as parquet
23. location '/warehouse/gmall/dwd/dwd_dim_sku_info/'
24. tblproperties ("parquet.compression"="lzo");
25.
复制代码
2)数据装载
1. insert overwrite table dwd_dim_sku_info partition(dt='2020-03-10')
2. select
3. sku.id,
4. sku.spu_id,
5. sku.price,
6. sku.sku_name,
7. sku.sku_desc,
8. sku.weight,
9. sku.tm_id,
10. ob.tm_name,
11. sku.category3_id,
12. c2.id category2_id,
13. c1.id category1_id,
14. c3.name category3_name,
15. c2.name category2_name,
16. c1.name category1_name,
17. spu.spu_name,
18. sku.create_time
19. from
20. (
21. select * from ods_sku_info where dt='2020-03-10'
22. )sku
23. join
24. (
25. select * from ods_base_trademark where dt='2020-03-10'
26. )ob on sku.tm_id=ob.tm_id
27. join
28. (
29. select * from ods_spu_info where dt='2020-03-10'
30. )spu on spu.id = sku.spu_id
31. join
32. (
33. select * from ods_base_category3 where dt='2020-03-10'
34. )c3 on sku.category3_id=c3.id
35. join
36. (
37. select * from ods_base_category2 where dt='2020-03-10'
38. )c2 on c3.category2_id=c2.id
39. join
40. (
41. select * from ods_base_category1 where dt='2020-03-10'
42. )c1 on c2.category1_id=c1.id;
43.
复制代码
3)查询加载结果
1. select * from dwd_dim_sku_info where dt='2020-03-10';
2.
复制代码
1.1.2 优惠券信息表(全量)
把 ODS 层 ods_coupon_info 表数据导入到 DWD 层优惠卷信息表,在导入过程中可以做适当的清洗
1)建表语句
1. drop table if exists dwd_dim_coupon_info;
2. create external table dwd_dim_coupon_info(
3. `id` string COMMENT '购物券编号',
4. `coupon_name` string COMMENT '购物券名称',
5. `coupon_type` string COMMENT '购物券类型 1 现金券 2 折扣券 3 满减券 4 满件打折券',
6. `condition_amount` string COMMENT '满额数',
7. `condition_num` string COMMENT '满件数',
8. `activity_id` string COMMENT '活动编号',
9. `benefit_amount` string COMMENT '减金额',
10. `benefit_discount` string COMMENT '折扣',
11. `create_time` string COMMENT '创建时间',
12. `range_type` string COMMENT '范围类型 1、商品 2、品类 3、品牌',
13. `spu_id` string COMMENT '商品 id',
14. `tm_id` string COMMENT '品牌 id',
15. `category3_id` string COMMENT '品类 id',
16. `limit_num` string COMMENT '最多领用次数',
17. `operate_time` string COMMENT '修改时间',
18. `expire_time` string COMMENT '过期时间'
19. ) COMMENT '优惠券信息表'
20. PARTITIONED BY (`dt` string)
21. row format delimited fields terminated by '\t'
22. stored as parquet
23. location '/warehouse/gmall/dwd/dwd_dim_coupon_info/'
24. tblproperties ("parquet.compression"="lzo");
25.
复制代码
2)数据装载
1. insert overwrite table dwd_dim_coupon_info partition(dt='2020-03-10')
2. select
3. id,
4. coupon_name,
5. coupon_type,
6. condition_amount,
7. condition_num,
8. activity_id,
9. benefit_amount,
10. benefit_discount,
11. create_time,
12. range_type,
13. spu_id,
14. tm_id,
15. category3_id,
16. limit_num,
17. operate_time,
18. expire_time
19. from ods_coupon_info
20. where dt='2020-03-10';
21.
复制代码
3)查询加载结果
1. select * from dwd_dim_coupon_info where dt='2020-03-10';
2.
复制代码
1.1.3 活动维度表(全量)
1)建表语句
1. drop table if exists dwd_dim_activity_info;
2. create external table dwd_dim_activity_info(
3. `id` string COMMENT '编号',
4. `activity_name` string COMMENT '活动名称',
5. `activity_type` string COMMENT '活动类型',
6. `condition_amount` string COMMENT '满减金额',
7. `condition_num` string COMMENT '满减件数',
8. `benefit_amount` string COMMENT '优惠金额',
9. `benefit_discount` string COMMENT '优惠折扣',
10. `benefit_level` string COMMENT '优惠级别',
11. `start_time` string COMMENT '开始时间',
12. `end_time` string COMMENT '结束时间',
13. `create_time` string COMMENT '创建时间'
14. ) COMMENT '活动信息表'
15. PARTITIONED BY (`dt` string)
16. row format delimited fields terminated by '\t'
17. stored as parquet
18. location '/warehouse/gmall/dwd/dwd_dim_activity_info/'
19. tblproperties ("parquet.compression"="lzo");
20.
复制代码
2)数据装载
1. insert overwrite table dwd_dim_activity_info partition(dt='2020-03-10')
2. select
3. info.id,
4. info.activity_name,
5. info.activity_type,
6. rule.condition_amount,
7. rule.condition_num,
8. rule.benefit_amount,
9. rule.benefit_discount,
10. rule.benefit_level,
11. info.start_time,
12. info.end_time,
13. info.create_time
14. from
15. (
16. select * from ods_activity_info where dt='2020-03-10'
17. )info
18. left join
19. (
20. select * from ods_activity_rule where dt='2020-03-10'
21. )rule on info.id = rule.activity_id;
22.
复制代码
3)查询加载结果
1. select * from dwd_dim_activity_info where dt='2020-03-10';
复制代码
1.1.4 地区维度表(特殊)
1)建表语句
1. DROP TABLE IF EXISTS `dwd_dim_base_province`;
2. CREATE EXTERNAL TABLE `dwd_dim_base_province` (
3. `id` string COMMENT 'id',
4. `province_name` string COMMENT '省市名称',
5. `area_code` string COMMENT '地区编码',
6. `iso_code` string COMMENT 'ISO 编码',
7. `region_id` string COMMENT '地区 id',
8. `region_name` string COMMENT '地区名称'
9. )
10. COMMENT '地区省市表'
11. stored as parquet
12. location '/warehouse/gmall/dwd/dwd_dim_base_province/'
13. tblproperties ("parquet.compression"="lzo");
14.
复制代码
2)数据装载
1. insert overwrite table dwd_dim_base_province
2. select
3. bp.id,
4. bp.name,
5. bp.area_code,
6. bp.iso_code,
7. bp.region_id,
8. br.region_name
9. from ods_base_province bp
10. join ods_base_region br
11. on bp.region_id=br.id;
12.
复制代码
1.1.5 时间维度表(特殊)(预留)
1)建表语句
1. DROP TABLE IF EXISTS `dwd_dim_date_info`;
2. CREATE EXTERNAL TABLE `dwd_dim_date_info`(
3. `date_id` string COMMENT '日',
4. `week_id` int COMMENT '周',
5. `week_day` int COMMENT '周的第几天',
6. `day` int COMMENT '每月的第几天',
7. `month` int COMMENT '第几月',
8. `quarter` int COMMENT '第几季度',
9. `year` int COMMENT '年',
10. `is_workday` int COMMENT '是否是周末',
11. `holiday_id` int COMMENT '是否是节假日'
12. )
13. row format delimited fields terminated by '\t'
14. stored as parquet
15. location '/warehouse/gmall/dwd/dwd_dim_date_info/'
16. tblproperties ("parquet.compression"="lzo");
17.
复制代码
2)把 date_info.txt 文件上传到 node01 的 /opt/modules/db_log/路径
3)数据装载
1. load data local inpath '/opt/modules/db_log/date_info.txt' into table dwd_dim_date_info;
2.
复制代码
4)查询加载结果
1. select * from dwd_dim_date_info;
2.
复制代码
1.1.6 订单明细事实表(事务型快照事实表)
1)建表语句
1. drop table if exists dwd_fact_order_detail;
2. create external table dwd_fact_order_detail (
3. `id` string COMMENT '订单编号',
4. `order_id` string COMMENT '订单号',
5. `user_id` string COMMENT '用户 id',
6. `sku_id` string COMMENT 'sku 商品 id',
7. `sku_name` string COMMENT '商品名称',
8. `order_price` decimal(10,2) COMMENT '商品价格',
9. `sku_num` bigint COMMENT '商品数量',
10. `create_time` string COMMENT '创建时间',
11. `province_id` string COMMENT '省份 ID',
12. `total_amount` decimal(20,2) COMMENT '订单总金额'
13. )
14. PARTITIONED BY (`dt` string)
15. stored as parquet
16. location '/warehouse/gmall/dwd/dwd_fact_order_detail/'
17. tblproperties ("parquet.compression"="lzo");
18.
复制代码
2)数据装载
1. insert overwrite table dwd_fact_order_detail partition(dt='2020-03-10')
2. select
3. od.id,
4. od.order_id,
5. od.user_id,
6. od.sku_id,
7. od.sku_name,
8. od.order_price,
9. od.sku_num,
10. od.create_time,
11. oi.province_id,
12. od.order_price*od.sku_num
13. from
14. (
15. select * from ods_order_detail where dt='2020-03-10'
16. ) od
17. join
18. (
19. select * from ods_order_info where dt='2020-03-10'
20. ) oi
21. on od.order_id=oi.id;
22.
复制代码
3)查询加载结果
1. select * from dwd_fact_order_detail where dt='2020-03-10';
复制代码
1.1.7 支付事实表(事务型快照事实表)
1)建表语句
1. drop table if exists dwd_fact_payment_info;
2. create external table dwd_fact_payment_info (
3. `id` string COMMENT '',
4. `out_trade_no` string COMMENT '对外业务编号',
5. `order_id` string COMMENT '订单编号',
6. `user_id` string COMMENT '用户编号',
7. `alipay_trade_no` string COMMENT '支付宝交易流水编号',
8. `payment_amount` decimal(16,2) COMMENT '支付金额',
9. `subject` string COMMENT '交易内容',
10. `payment_type` string COMMENT '支付类型',
11. `payment_time` string COMMENT '支付时间',
12. `province_id` string COMMENT '省份 ID'
13. )
14. PARTITIONED BY (`dt` string)
15. stored as parquet
16. location '/warehouse/gmall/dwd/dwd_fact_payment_info/'
17. tblproperties ("parquet.compression"="lzo");
18.
复制代码
2)数据装载
1. insert overwrite table dwd_fact_payment_info partition(dt='2020-03-10')
2. select
3. pi.id,
4. pi.out_trade_no,
5. pi.order_id,
6. pi.user_id,
7. pi.alipay_trade_no,
8. pi.total_amount,
9. pi.subject,
10. pi.payment_type,
11. pi.payment_time,
12. oi.province_id
13. from
14. (
15. select * from ods_payment_info where dt='2020-03-10'
16. )pi
17. join
18. (
19. select id, province_id from ods_order_info where dt='2020-03-10'
20. )oi
21. on pi.order_id = oi.id;
22.
复制代码
3)查询加载结果
1. select * from dwd_fact_payment_info where dt='2020-03-10';
2.
复制代码
1.1.8 退款事实表(事务型快照事实表)
把 ODS 层 ods_order_refund_info 表数据导入到 DWD 层退款事实表,在导入过程中可以做适当的清洗
1)建表语句
1. drop table if exists dwd_fact_order_refund_info;
2. create external table dwd_fact_order_refund_info(
3. `id` string COMMENT '编号',
4. `user_id` string COMMENT '用户 ID',
5. `order_id` string COMMENT '订单 ID',
6. `sku_id` string COMMENT '商品 ID',
7. `refund_type` string COMMENT '退款类型',
8. `refund_num` bigint COMMENT '退款件数',
9. `refund_amount` decimal(16,2) COMMENT '退款金额',
10. `refund_reason_type` string COMMENT '退款原因类型',
11. `create_time` string COMMENT '退款时间'
12. ) COMMENT '退款事实表'
13. PARTITIONED BY (`dt` string)
14. row format delimited fields terminated by '\t'
15. location '/warehouse/gmall/dwd/dwd_fact_order_refund_info/';
16.
复制代码
2)数据装载
1. insert overwrite table dwd_fact_order_refund_info partition(dt='2020-03-10')
2. select
3. id,
4. user_id,
5. order_id,
6. sku_id,
7. refund_type,
8. refund_num,
9. refund_amount,
10. refund_reason_type,
11. create_time
12. from ods_order_refund_info
13. where dt='2020-03-10';
14.
复制代码
3)查询加载结果
1. select * from dwd_fact_order_refund_info where dt='2020-03-10';
2.
复制代码
1.1.9 评价事实表(事务型快照事实表)
把 ODS 层 ods_comment_info 表数据导入到 DWD 层评价事实表,在导入过程中可以做适当的清洗
1)建表语句
1. drop table if exists dwd_fact_comment_info;
2. create external table dwd_fact_comment_info(
3. `id` string COMMENT '编号',
4. `user_id` string COMMENT '用户 ID',
5. `sku_id` string COMMENT '商品 sku',
6. `spu_id` string COMMENT '商品 spu',
7. `order_id` string COMMENT '订单 ID',
8. `appraise` string COMMENT '评价',
9. `create_time` string COMMENT '评价时间'
10. ) COMMENT '评价事实表'
11. PARTITIONED BY (`dt` string)
12. row format delimited fields terminated by '\t'
13. location '/warehouse/gmall/dwd/dwd_fact_comment_info/';
14.
复制代码
2)数据装载
1. insert overwrite table dwd_fact_comment_info partition(dt='2020-03-10')
2. select
3. id,
4. user_id,
5. sku_id,
6. spu_id,
7. order_id,
8. appraise,
9. create_time
10. from ods_comment_info
11. where dt='2020-03-10';
12.
复制代码
3)查询加载结果
1. select * from dwd_fact_comment_info where dt='2020-03-10';
2.
复制代码
1.1.10 加购事实表(周期型快照事实表,每日快照)
由于购物车的数量是会发生变化,所以导增量不合适
每天做一次快照,导入的数据是全量,区别于事务型事实表是每天导入新增
周期型快照事实表劣势:存储的数据量会比较大
解决方案:周期型快照事实表存储的数据比较讲究时效性,时间太久了的意义不大,可以删除以前的数据
1)建表语句
1. drop table if exists dwd_fact_cart_info;
2. create external table dwd_fact_cart_info(
3. `id` string COMMENT '编号',
4. `user_id` string COMMENT '用户 id',
5. `sku_id` string COMMENT 'skuid',
6. `cart_price` string COMMENT '放入购物车时价格',
7. `sku_num` string COMMENT '数量',
8. `sku_name` string COMMENT 'sku 名称 (冗余)',
9. `create_time` string COMMENT '创建时间',
10. `operate_time` string COMMENT '修改时间',
11. `is_ordered` string COMMENT '是否已经下单。1 为已下单;0 为未下单',
12. `order_time` string COMMENT '下单时间'
13. ) COMMENT '加购事实表'
14. PARTITIONED BY (`dt` string)
15. row format delimited fields terminated by '\t'
16. location '/warehouse/gmall/dwd/dwd_fact_cart_info/';
17.
复制代码
2)数据装载
1. insert overwrite table dwd_fact_cart_info partition(dt='2020-03-10')
2. select
3. id,
4. user_id,
5. sku_id,
6. cart_price,
7. sku_num,
8. sku_name,
9. create_time,
10. operate_time,
11. is_ordered,
12. order_time
13. from ods_cart_info
14. where dt='2020-03-10';
15.
复制代码
3)查询加载结果
1. select * from dwd_fact_cart_info where dt='2020-03-10';
2.
复制代码
1.1.11 收藏事实表(周期型快照事实表,每日快照)
收藏的标记,是否取消,会发生变化,做增量不合适
每天做一次快照,导入的数据是全量,区别于事务型事实表是每天导入新增
1)建表语句
1. drop table if exists dwd_fact_favor_info;
2. create external table dwd_fact_favor_info(
3. `id` string COMMENT '编号',
4. `user_id` string COMMENT '用户 id',
5. `sku_id` string COMMENT 'skuid',
6. `spu_id` string COMMENT 'spuid',
7. `is_cancel` string COMMENT '是否取消',
8. `create_time` string COMMENT '收藏时间',
9. `cancel_time` string COMMENT '取消时间'
10. ) COMMENT '收藏事实表'
11. PARTITIONED BY (`dt` string)
12. row format delimited fields terminated by '\t'
13. location '/warehouse/gmall/dwd/dwd_fact_favor_info/';
14.
复制代码
2)数据装载
1. insert overwrite table dwd_fact_favor_info partition(dt='2020-03-10')
2. select
3. id,
4. user_id,
5. sku_id,
6. spu_id,
7. is_cancel,
8. create_time,
9. cancel_time
10. from ods_favor_info
11. where dt='2020-03-10';
12.
复制代码
3)查询加载结果
1. select * from dwd_fact_favor_info where dt='2020-03-10';
2.
复制代码
1.1.12 优惠券领用事实表(累积型快照事实表)
优惠卷的生命周期:领取优惠卷-》用优惠卷下单-》优惠卷参与支付
累积型快照事实表使用:统计优惠卷领取次数、优惠卷下单次数、优惠卷参与支付次数
1)建表语句
1. drop table if exists dwd_fact_coupon_use;
2. create external table dwd_fact_coupon_use(
3. `id` string COMMENT '编号',
4. `coupon_id` string COMMENT '优惠券 ID',
5. `user_id` string COMMENT 'userid',
6. `order_id` string COMMENT '订单 id',
7. `coupon_status` string COMMENT '优惠券状态',
8. `get_time` string COMMENT '领取时间',
9. `using_time` string COMMENT '使用时间(下单)',
10. `used_time` string COMMENT '使用时间(支付)'
11. ) COMMENT '优惠券领用事实表'
12. PARTITIONED BY (`dt` string)
13. row format delimited fields terminated by '\t'
14. location '/warehouse/gmall/dwd/dwd_fact_coupon_use/';
15.
复制代码
注意:dt 是按照优惠卷领用时间 get_time 做为分区
2)数据装载
1. set hive.exec.dynamic.partition.mode=nonstrict;
2. insert overwrite table dwd_fact_coupon_use partition(dt)
3. select
4. if(new.id is null,old.id,new.id),
5. if(new.coupon_id is null,old.coupon_id,new.coupon_id),
6. if(new.user_id is null,old.user_id,new.user_id),
7. if(new.order_id is null,old.order_id,new.order_id),
8. if(new.coupon_status is null,old.coupon_status,new.coupon_status),
9. if(new.get_time is null,old.get_time,new.get_time),
10. if(new.using_time is null,old.using_time,new.using_time),
11. if(new.used_time is null,old.used_time,new.used_time),
12. date_format(if(new.get_time is null,old.get_time,new.get_time),'yyyy-MM-dd')
13. from
14. (
15. select
16. id,
17. coupon_id,
18. user_id,
19. order_id,
20. coupon_status,
21. get_time,
22. using_time,
23. used_time
24. from dwd_fact_coupon_use
25. where dt in
26. (
27. select
28. date_format(get_time,'yyyy-MM-dd')
29. from ods_coupon_use
30. where dt='2020-03-10'
31. )
32. )old
33. full outer join
34. (
35. select
36. id,
37. coupon_id,
38. user_id,
39. order_id,
40. coupon_status,
41. get_time,
42. using_time,
43. used_time
44. from ods_coupon_use
45. where dt='2020-03-10'
46. )new
47. on old.id=new.id;
48.
复制代码
3)查询加载结果
1. select * from dwd_fact_coupon_use where dt='2020-03-10';
2.
复制代码
1.1.13 订单事实表(累积型快照事实表)
1)concat 函数
concat 函数在连接字符串的时候,只要其中一个是 NULL,那么将返回 NULL
1. hive> select concat('a','b');
2. ab
3. hive> select concat('a','b',null);
4. NULL
5.
复制代码
2)concat_ws 函数
concat_ws 函数在连接字符串的时候,只要有一个字符串不是 NULL,就不会返回 NULL。concat_ws 函数需要指定分隔符
1. hive> select concat_ws('-','a','b');
2. a-b
3. hive> select concat_ws('-','a','b',null);
4. a-b
5. hive> select concat_ws('','a','b',null);
6. ab
7.
复制代码
3)STR_TO_MAP 函数
- (1)语法描述
STR_TO_MAP(VARCHAR text, VARCHAR listDelimiter, VARCHAR keyValueDelimiter)
- (2)功能描述
使用 listDelimiter 将 text 分隔成 K-V 对,然后使用 keyValueDelimiter 分隔每个 K-V 对,
组装成 MAP 返回。默认 listDelimiter 为( ,),keyValueDelimiter 为(=)。
- (3)案例
str_to_map(‘1001=2020-03-10,1002=2020-03-10’, ‘,’ , ‘=’)
输出{“1001”:“2020-03-10”,“1002”:“2020-03-10”}
4)建表语句
订单生命周期:创建时间=》支付时间=》取消时间=》完成时间=》退款时间=》退款完成时间
由于 ODS 层订单表只有创建时间和操作时间两个状态,不能表达所有时间含义,所以需要关联订单状态表。订单事实表里面增加了活动 id,所以需要关联活动订单表
1. drop table if exists dwd_fact_order_info;
2. create external table dwd_fact_order_info (
3. `id` string COMMENT '订单编号',
4. `order_status` string COMMENT '订单状态',
5. `user_id` string COMMENT '用户 id',
6. `out_trade_no` string COMMENT '支付流水号',
7. `create_time` string COMMENT '创建时间(未支付状态)',
8. `payment_time` string COMMENT '支付时间(已支付状态)',
9. `cancel_time` string COMMENT '取消时间(已取消状态)',
10. `finish_time` string COMMENT '完成时间(已完成状态)',
11. `refund_time` string COMMENT '退款时间(退款中状态)',
12. `refund_finish_time` string COMMENT '退款完成时间(退款完成状态)',
13. `province_id` string COMMENT '省份 ID',
14. `activity_id` string COMMENT '活动 ID',
15. `original_total_amount` string COMMENT '原价金额',
16. `benefit_reduce_amount` string COMMENT '优惠金额',
17. `feight_fee` string COMMENT '运费',
18. `final_total_amount` decimal(10,2) COMMENT '订单金额'
19. )
20. PARTITIONED BY (`dt` string)
21. stored as parquet
22. location '/warehouse/gmall/dwd/dwd_fact_order_info/'
23. tblproperties ("parquet.compression"="lzo");
24.
复制代码
5)数据装载
5)常用函数
更多函数请点击博客【HIve】Hive入门解析(五)
6)数据装载
1. set hive.exec.dynamic.partition.mode=nonstrict;
2. insert overwrite table dwd_fact_order_info partition(dt)
3. select
4. if(new.id is null,old.id,new.id),
5. if(new.order_status is null,old.order_status,new.order_status),
6. if(new.user_id is null,old.user_id,new.user_id),
7. if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no),
8. if(new.tms['1001'] is null,old.create_time,new.tms['1001']),--1001 对应未支付状态
9. if(new.tms['1002'] is null,old.payment_time,new.tms['1002']),
10. if(new.tms['1003'] is null,old.cancel_time,new.tms['1003']),
11. if(new.tms['1004'] is null,old.finish_time,new.tms['1004']),
12. if(new.tms['1005'] is null,old.refund_time,new.tms['1005']),
13. if(new.tms['1006'] is null,old.refund_finish_time,new.tms['1006']),
14. if(new.province_id is null,old.province_id,new.province_id),
15. if(new.activity_id is null,old.activity_id,new.activity_id),
16. if(new.original_total_amount is
17. null,old.original_total_amount,new.original_total_amount),
18. if(new.benefit_reduce_amount is
19. null,old.benefit_reduce_amount,new.benefit_reduce_amount),
20. if(new.feight_fee is null,old.feight_fee,new.feight_fee),
21. if(new.final_total_amount is null,old.final_total_amount,new.final_total_amount),
22. date_format(if(new.tms['1001'] is
23. null,old.create_time,new.tms['1001']),'yyyy-MM-dd')
24. from
25. (
26. select
27. id,
28. order_status,
29. user_id,
30. out_trade_no,
31. create_time,
32. payment_time,
33. cancel_time,
34. finish_time,
35. refund_time,
36. refund_finish_time,
37. province_id,
38. activity_id,
39. original_total_amount,
40. benefit_reduce_amount,
41. feight_fee,
42. final_total_amount
43. from dwd_fact_order_info
44. where dt
45. in
46. (
47. select
48. date_format(create_time,'yyyy-MM-dd')
49. from ods_order_info
50. where dt='2020-03-10'
51. )
52. )old
53. full outer join
54. (
55. select
56. info.id,
57. info.order_status,
58. info.user_id,
59. info.out_trade_no,
60. info.province_id,
61. act.activity_id,
62. log.tms,
63. info.original_total_amount,
64. info.benefit_reduce_amount,
65. info.feight_fee,
66. info.final_total_amount
67. from
68. (
69. select
70. order_id,
71. str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','=')
72. tms
73. from ods_order_status_log
74. where dt='2020-03-10'
75. group by order_id
76. )log
77. join
78. (
79. select * from ods_order_info where dt='2020-03-10'
80. )info
81. on log.order_id=info.id
82. left join
83. (
84. select * from ods_activity_order where dt='2020-03-10'
85. )act
86. on log.order_id=act.order_id
87. )new
88. on old.id=new.id;
复制代码
6)查询加载结果
1. select * from dwd_fact_order_info where dt='2020-03-10';
2.
复制代码
1.1.14 用户维度表(拉链表)
用户表中的数据每日既有可能新增,也有可能修改,但修改频率并不高,属于缓慢变化
维度,此处采用拉链表存储用户维度数据
1)什么是拉链表
2)为什么要做拉链表
3)拉链表形成过程
4)拉链表制作过程图
5)拉链表制作过程
步骤 0:初始化拉链表(首次独立执行)
(1)建立拉链表
1. drop table if exists dwd_dim_user_info_his;
2. create external table dwd_dim_user_info_his(
3. `id` string COMMENT '用户 id',
4. `name` string COMMENT '姓名',
5. `birthday` string COMMENT '生日',
6. `gender` string COMMENT '性别',
7. `email` string COMMENT '邮箱',
8. `user_level` string COMMENT '用户等级',
9. `create_time` string COMMENT '创建时间',
10. `operate_time` string COMMENT '操作时间',
11. `start_date` string COMMENT '有效开始日期',
12. `end_date` string COMMENT '有效结束日期'
13. ) COMMENT '订单拉链表'
14. stored as parquet
15. location '/warehouse/gmall/dwd/dwd_dim_user_info_his/'
16. tblproperties ("parquet.compression"="lzo");
复制代码
(2)初始化拉链表
1. insert overwrite table dwd_dim_user_info_his
2. select
3. id,
4. name,
5. birthday,
6. gender,
7. email,
8. user_level,
9. create_time,
10. operate_time,
11. '2020-03-10',
12. '9999-99-99'
13. from ods_user_info oi
14. where oi.dt='2020-03-10';
15.
复制代码
步骤 1:制作当日变动数据(包括新增,修改)每日执行
(1)如何获得每日变动表
- a.最好表内有创建时间和变动时间(Lucky!)
- b.如果没有,可以利用第三方工具监控比如 canal,监控 MySQL 的实时变化进行记录(麻烦)
- c.逐行对比前后两天的数据,检查 md5(concat(全部有可能变化的字段))是否相同(low)
- d.要求业务数据库提供变动流水(人品,颜值)
(2)因为 ods_order_info 本身导入过来就是新增变动明细的表,所以不用处理
- a)数据库中新增 2020-03-11 一天的数据
- b)通过 Sqoop 把 2020-03-11 日所有数据导入mysqlTohdfs.sh all 2020-03-11
- c)ods 层数据导入hdfs_to_ods_db.sh all 2020-03-11
步骤 2:先合并变动信息,再追加新增信息,插入到临时表中
1)建立临时表
1. drop table if exists dwd_dim_user_info_his_tmp;
2. create external table dwd_dim_user_info_his_tmp(
3. `id` string COMMENT '用户 id',
4. `name` string COMMENT '姓名',
5. `birthday` string COMMENT '生日',
6. `gender` string COMMENT '性别',
7. `email` string COMMENT '邮箱',
8. `user_level` string COMMENT '用户等级',
9. `create_time` string COMMENT '创建时间',
10. `operate_time` string COMMENT '操作时间',
11. `start_date` string COMMENT '有效开始日期',
12. `end_date` string COMMENT '有效结束日期'
13. ) COMMENT '订单拉链临时表'
14. stored as parquet
15. location '/warehouse/gmall/dwd/dwd_dim_user_info_his_tmp/'
16. tblproperties ("parquet.compression"="lzo");
17.
复制代码
2)导入脚本
1. insert overwrite table dwd_dim_user_info_his_tmp
2. select * from
3. (
4. select
5. id,
6. name,
7. birthday,
8. gender,
9. email,
10. user_level,
11. create_time,
12. operate_time,
13. '2020-03-11' start_date,
14. '9999-99-99' end_date
15. from ods_user_info where dt='2020-03-11'
16. union all
17. select
18. uh.id,
19. uh.name,
20. uh.birthday,
21. uh.gender,
22. uh.email,
23. uh.user_level,
24. uh.create_time,
25. uh.operate_time,
26. uh.start_date,
27. if(ui.id is not null and uh.end_date='9999-99-99', date_add(ui.dt,-1),
28. uh.end_date) end_date
29. from dwd_dim_user_info_his uh left join
30. (
31. select
32. *
33. from ods_user_info
34. where dt='2020-03-11'
35. ) ui on uh.id=ui.id
36. )his
37. order by his.id, start_date;
复制代码
步骤 3:把临时表覆盖给拉链表
1)导入数据
1. insert overwrite table dwd_dim_user_info_his
2. select * from dwd_dim_user_info_his_tmp;
3.
复制代码
2)查询导入数据
1. select id, start_date, end_date from dwd_dim_user_info_his;
2.
复制代码
1.1.15 DWD 层数据导入脚本
1)vim ods_to_dwd_db.sh
1. #!/bin/bash
2. APP=gmall
3. hive=/opt/modules/hive/bin/hive
4. # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
5. if [ -n "$2" ] ;then
6. do_date=$2
7. else
8. do_date=`date -d "-1 day" +%F`
9. fi
10. sql1="
11. set hive.exec.dynamic.partition.mode=nonstrict;
12.
13. insert overwrite table ${APP}.dwd_dim_sku_info partition(dt='$do_date')
14. select
15. sku.id,
16. sku.spu_id,
17. sku.price,
18. sku.sku_name,
19. sku.sku_desc,
20. sku.weight,
21. sku.tm_id,
22. ob.tm_name,
23. sku.category3_id,
24. c2.id category2_id,
25. c1.id category1_id,
26. c3.name category3_name,
27. c2.name category2_name,
28. c1.name category1_name,
29. spu.spu_name,
30. sku.create_time
31. from
32. (
33. select * from ${APP}.ods_sku_info where dt='$do_date'
34. )sku
35. join
36. (
37. select * from ${APP}.ods_base_trademark where dt='$do_date'
38. )ob on sku.tm_id=ob.tm_id
39. join
40. (
41. select * from ${APP}.ods_spu_info where dt='$do_date'
42. )spu on spu.id = sku.spu_id
43. join
44. (
45. select * from ${APP}.ods_base_category3 where dt='$do_date'
46. )c3 on sku.category3_id=c3.id
47. join
48. (
49. select * from ${APP}.ods_base_category2 where dt='$do_date'
50. )c2 on c3.category2_id=c2.id
51. join
52. (
53. select * from ${APP}.ods_base_category1 where dt='$do_date'
54. )c1 on c2.category1_id=c1.id;
55.
56.
57. insert overwrite table ${APP}.dwd_dim_coupon_info partition(dt='$do_date')
58. select
59. id,
60. coupon_name,
61. coupon_type,
62. condition_amount,
63. condition_num,
64. activity_id,
65. benefit_amount,
66. benefit_discount,
67. create_time,
68. range_type,
69. spu_id,
70. tm_id,
71. category3_id,
72. limit_num,
73. operate_time,
74. expire_time
75. from ${APP}.ods_coupon_info
76. where dt='$do_date';
77.
78.
79. insert overwrite table ${APP}.dwd_dim_activity_info partition(dt='$do_date')
80. select
81. info.id,
82. info.activity_name,
83. info.activity_type,
84. rule.condition_amount,
85. rule.condition_num,
86. rule.benefit_amount,
87. rule.benefit_discount,
88. rule.benefit_level,
89. info.start_time,
90. info.end_time,
91. info.create_time
92. from
93. (
94. select * from ${APP}.ods_activity_info where dt='$do_date'
95. )info
96. left join
97. (
98. select * from ${APP}.ods_activity_rule where dt='$do_date'
99. )rule on info.id = rule.activity_id;
100.
101.
102. insert overwrite table ${APP}.dwd_fact_order_detail partition(dt='$do_date')
103. select
104. od.id,
105. od.order_id,
106. od.user_id,
107. od.sku_id,
108. od.sku_name,
109. od.order_price,
110. od.sku_num,
111. od.create_time,
112. oi.province_id,
113. od.order_price*od.sku_num
114. from
115. (
116. select * from ${APP}.ods_order_detail where dt='$do_date'
117. ) od
118. join
119. (
120. select * from ${APP}.ods_order_info where dt='$do_date'
121. ) oi
122. on od.order_id=oi.id;
123.
124.
125. insert overwrite table ${APP}.dwd_fact_payment_info partition(dt='$do_date')
126. select
127. pi.id,
128. pi.out_trade_no,
129. pi.order_id,
130. pi.user_id,
131. pi.alipay_trade_no,
132. pi.total_amount,
133. pi.subject,
134. pi.payment_type,
135. pi.payment_time,
136. oi.province_id
137. from
138. (
139. select * from ${APP}.ods_payment_info where dt='$do_date'
140. )pi
141. join
142. (
143. select id, province_id from ${APP}.ods_order_info where dt='$do_date'
144. )oi
145. on pi.order_id = oi.id;
146.
147.
148. insert overwrite table ${APP}.dwd_fact_order_refund_info partition(dt='$do_date')
149. select
150. id,
151. user_id,
152. order_id,
153. sku_id,
154. refund_type,
155. refund_num,
156. refund_amount,
157. refund_reason_type,
158. create_time
159. from ${APP}.ods_order_refund_info
160. where dt='$do_date';
161.
162.
163. insert overwrite table ${APP}.dwd_fact_comment_info partition(dt='$do_date')
164. select
165. id,
166. user_id,
167. sku_id,
168. spu_id,
169. order_id,
170. appraise,
171. create_time
172. from ${APP}.ods_comment_info
173. where dt='$do_date';
174.
175.
176. insert overwrite table ${APP}.dwd_fact_cart_info partition(dt='$do_date')
177. select
178. id,
179. user_id,
180. sku_id,
181. cart_price,
182. sku_num,
183. sku_name,
184. create_time,
185. operate_time,
186. is_ordered,
187. order_time
188. from ${APP}.ods_cart_info
189. where dt='$do_date';
190.
191.
192. insert overwrite table ${APP}.dwd_fact_favor_info partition(dt='$do_date')
193. select
194. id,
195. user_id,
196. sku_id,
197. spu_id,
198. is_cancel,
199. create_time,
200. cancel_time
201. from ${APP}.ods_favor_info
202. where dt='$do_date';
203.
204.
205. insert overwrite table ${APP}.dwd_fact_coupon_use partition(dt)
206. select
207. if(new.id is null,old.id,new.id),
208. if(new.coupon_id is null,old.coupon_id,new.coupon_id),
209. if(new.user_id is null,old.user_id,new.user_id),
210. if(new.order_id is null,old.order_id,new.order_id),
211. if(new.coupon_status is null,old.coupon_status,new.coupon_status),
212. if(new.get_time is null,old.get_time,new.get_time),
213. if(new.using_time is null,old.using_time,new.using_time),
214. if(new.used_time is null,old.used_time,new.used_time),
215. date_format(if(new.get_time is null,old.get_time,new.get_time),'yyyy-MM-dd')
216. from
217. (
218. select
219. id,
220. coupon_id,
221. user_id,
222. order_id,
223. coupon_status,
224. get_time,
225. using_time,
226. used_time
227. from ${APP}.dwd_fact_coupon_use
228. where dt in
229. (
230. select
231. date_format(get_time,'yyyy-MM-dd')
232. from ${APP}.ods_coupon_use
233. where dt='$do_date'
234. )
235. )old
236. full outer join
237. (
238. select
239. id,
240. coupon_id,
241. user_id,
242. order_id,
243. coupon_status,
244. get_time,
245. using_time,
246. used_time
247. from ${APP}.ods_coupon_use
248. where dt='$do_date'
249. )new
250. on old.id=new.id;
251.
252.
253. insert overwrite table ${APP}.dwd_fact_order_info partition(dt)
254. select
255. if(new.id is null,old.id,new.id),
256. if(new.order_status is null,old.order_status,new.order_status),
257. if(new.user_id is null,old.user_id,new.user_id),
258. if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no),
259. if(new.tms['1001'] is null,old.create_time,new.tms['1001']),--1001 对应未支付状态
260. if(new.tms['1002'] is null,old.payment_time,new.tms['1002']),
261. if(new.tms['1003'] is null,old.cancel_time,new.tms['1003']),
262. if(new.tms['1004'] is null,old.finish_time,new.tms['1004']),
263. if(new.tms['1005'] is null,old.refund_time,new.tms['1005']),
264. if(new.tms['1006'] is null,old.refund_finish_time,new.tms['1006']),
265. if(new.province_id is null,old.province_id,new.province_id),
266. if(new.activity_id is null,old.activity_id,new.activity_id),
267. if(new.original_total_amount is
268. null,old.original_total_amount,new.original_total_amount),
269. if(new.benefit_reduce_amount is
270. null,old.benefit_reduce_amount,new.benefit_reduce_amount),
271. if(new.feight_fee is null,old.feight_fee,new.feight_fee),
272. if(new.final_total_amount is
273. null,old.final_total_amount,new.final_total_amount),
274. date_format(if(new.tms['1001'] is
275. null,old.create_time,new.tms['1001']),'yyyy-MM-dd')
276. from
277. (
278. select
279. id,
280. order_status,
281. user_id,
282. out_trade_no,
283. create_time,
284. payment_time,
285. cancel_time,
286. finish_time,
287. refund_time,
288. refund_finish_time,
289. province_id,
290. activity_id,
291. original_total_amount,
292. benefit_reduce_amount,
293. feight_fee,
294. final_total_amount
295. from ${APP}.dwd_fact_order_info
296. where dt
297. in
298. (
299. select
300. date_format(create_time,'yyyy-MM-dd')
301. from ${APP}.ods_order_info
302. where dt='$do_date'
303. )
304. )old
305. full outer join
306. (
307. select
308. info.id,
309. info.order_status,
310. info.user_id,
311. info.out_trade_no,
312. info.province_id,
313. act.activity_id,
314. log.tms,
315. info.original_total_amount,
316. info.benefit_reduce_amount,
317. info.feight_fee,
318. info.final_total_amount
319. from
320. (
321. select
322. order_id,
323. str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','
324. =') tms
325. from ${APP}.ods_order_status_log
326. where dt='$do_date'
327. group by order_id
328. )log
329. join
330. (
331. select * from ${APP}.ods_order_info where dt='$do_date'
332. )info
333. on log.order_id=info.id
334. left join
335. (
336. select * from ${APP}.ods_activity_order where dt='$do_date'
337. )act
338. on log.order_id=act.order_id
339. )new
340. on old.id=new.id;
341.
342.
343. insert overwrite table ${APP}.dwd_dim_user_info_his_tmp
344. select * from
345. (
346. select
347. id,
348. name,
349. birthday,
350. gender,
351. email,
352. user_level,
353. create_time,
354. operate_time,
355. '$do_date' start_date,
356. '9999-99-99' end_date
357. from ${APP}.ods_user_info where dt='$do_date'
358. union all
359. select
360. uh.id,
361. uh.name,
362. uh.birthday,
363. uh.gender,
364. uh.email,
365. uh.user_level,
366. uh.create_time,
367. uh.operate_time,
368. uh.start_date,
369. if(ui.id is not null and uh.end_date='9999-99-99', date_add(ui.dt,-1),
370. uh.end_date) end_date
371. from ${APP}.dwd_dim_user_info_his uh left join
372. (
373. select
374. *
375. from ${APP}.ods_user_info
376. where dt='$do_date'
377. ) ui on uh.id=ui.id
378. )his
379. order by his.id, start_date;
380.
381.
382. insert overwrite table ${APP}.dwd_dim_user_info_his select * from
383. ${APP}.dwd_dim_user_info_his_tmp;
384. "
385.
386. sql2="
387. insert overwrite table ${APP}.dwd_dim_base_province
388. select
389. bp.id,
390. bp.name,
391. bp.area_code,
392. bp.iso_code,
393. bp.region_id,
394. br.region_name
395. from ${APP}.ods_base_province bp
396. join ${APP}.ods_base_region br
397. on bp.region_id=br.id;
398. "
399.
400. case $1 in
401. "first"){
402. $hive -e "$sql1"
403. $hive -e "$sql2"
404. };;
405. "all"){
406. $hive -e "$sql1"
407. };;
408. esac
409.
复制代码
2)增加脚本执行权限
1. chmod 770 ods_to_dwd_db.sh
2.
复制代码
3)执行脚本导入数据
1. ods_to_dwd_db.sh all 2020-03-11
2.
复制代码
4)查看导入数据
1. select * from dwd_fact_order_info where dt='2020-03-11';
2. select * from dwd_fact_order_detail where dt='2020-03-11';
3. select * from dwd_fact_comment_info where dt='2020-03-11';
4. select * from dwd_fact_order_refund_info where dt='2020-03-11';
5.
复制代码
结束语
本章着重介绍了DWD层的业务数据的搭建流程,自此,DWD层搭建完成,下章开启DWS层的搭建!!!