第一题,建表
create database badou;use badou;create table orders (order_id string, user_id string, eval_set string,order_number string, order_dow string, order_hour_of_day string,days_since_prior_order string)row format delimited fields terminated by ','tblproperties("skip.header.line.count"="1");load data local inpath "/root/orders.csv" into table badou.orders;
查数,第二题,每个用户多少个订单
select * from badou.orders limit 10;
select user_id, count(*) as order_num from orders group by user_id;
建表
create table priors (order_id string, product_id string, add_to_cart_order string, reordered string)
row format delimited fields terminated by ',' tblproperties("skip.header.line.count"="1");
load data local inpath "/root/order_products__train.csv" into table badou.priors;
查数,第三题,每个用户每个订单多少个商品
select * from priors limit 10;
select od.user_id, sum(pc.pro_cnt) as prod_cnt_sum, count(od.order_id) as order_id_sum,
sum(pc.pro_cnt)/count(od.order_id) as avg_pre_pcount
from orders od join
(select order_id,count(*) as pro_cnt from priors group by order_id) pc
on od.order_id = pc.order_id
group by od.user_id
limit 10;
第四题,每个用户,每周几天的订单数分布情况
select user_id,
sum(case order_dow when '0' then 1 else 0 end) dow_0,
sum(case order_dow when '1' then 1 else 0 end) dow_1,
sum(case order_dow when '2' then 1 else 0 end) dow_2,
sum(case order_dow when '3' then 1 else 0 end) dow_3,
sum(case order_dow when '4' then 1 else 0 end) dow_4,
sum(case order_dow when '5' then 1 else 0 end) dow_5,
sum(case order_dow when '6' then 1 else 0 end) dow_6,
from orders group by user_id limit 20;
第五题,每用户每月(30天)平均订单数
select od.user_id,sum(pc.prod_cnt) as prod_cnt_sum,
ceiling(sum(cast(if(days_since_prior_order='','0.0',days_since_prior_order) as float))/30) as mom_cnt,
sum(pc.prod_cnt)/ceiling(sum(cast(if(days_since_prior_order='','0.0',days_since_prior_order) as float))/30) as avg_month_product_num
from orders od join
(select order_id, count(*) as prod_cnt from priors group by order_id)pc
on od.order_id=pc.order_id
group by od.user_id
limit 20;
第六题,每个用户最喜爱购买的三个商品
select user_id, product_id, pro_cnt, row_num from
(select user_id, product_id, pro_cnt, row_number() over(partition by user_id order by pro_cnt desc) as row_num
from
(select user_id,product_id,count(*) as pro_cnt
from orders od join priors pr on od.order_id=pr.order_id
group by user_id,product_id)t
)t1
where row_num<=3
limit 20;
也可以再包一层,形成列表
select user_id,collect_list(product_id) from
(select user_id, product_id, pro_cnt, row_num from
(select user_id, product_id, pro_cnt, row_number() over(partition by user_id order by pro_cnt desc) as row_num
from
(select user_id,product_id,count(*) as pro_cnt
from orders od join priors pr on od.order_id=pr.order_id
group by user_id,product_id)t
)t1 where row_num<=3
)t2
group by user_id
limit 20;
可惜,这个案例无法直接成功运行,报错
Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask
