第一题,建表

    1. create database badou;
    2. use badou;
    3. create table orders (order_id string, user_id string, eval_set string,
    4. order_number string, order_dow string, order_hour_of_day string,
    5. days_since_prior_order string)
    6. row format delimited fields terminated by ','
    7. tblproperties("skip.header.line.count"="1");
    8. load data local inpath "/root/orders.csv" into table badou.orders;

    查数,第二题,每个用户多少个订单

    select * from badou.orders limit 10;
    select user_id, count(*) as order_num from orders group by user_id;
    

    建表

    create table priors (order_id string, product_id string, add_to_cart_order string, reordered string)
      row format delimited fields terminated by ',' tblproperties("skip.header.line.count"="1");
    load data local inpath "/root/order_products__train.csv" into table badou.priors;
    

    查数,第三题,每个用户每个订单多少个商品

    select * from priors limit 10;
    select od.user_id, sum(pc.pro_cnt) as prod_cnt_sum, count(od.order_id) as order_id_sum,
        sum(pc.pro_cnt)/count(od.order_id) as avg_pre_pcount 
    from orders od join 
        (select order_id,count(*) as pro_cnt from priors group by order_id) pc 
      on od.order_id = pc.order_id 
    group by od.user_id
    limit 10;
    

    第四题,每个用户,每周几天的订单数分布情况

    select user_id, 
      sum(case order_dow when '0' then 1 else 0 end) dow_0,
      sum(case order_dow when '1' then 1 else 0 end) dow_1,
      sum(case order_dow when '2' then 1 else 0 end) dow_2,
      sum(case order_dow when '3' then 1 else 0 end) dow_3,
      sum(case order_dow when '4' then 1 else 0 end) dow_4,
      sum(case order_dow when '5' then 1 else 0 end) dow_5,
      sum(case order_dow when '6' then 1 else 0 end) dow_6,
    from orders group by user_id limit 20;
    

    第五题,每用户每月(30天)平均订单数

    select od.user_id,sum(pc.prod_cnt) as prod_cnt_sum,
      ceiling(sum(cast(if(days_since_prior_order='','0.0',days_since_prior_order) as float))/30) as mom_cnt,
      sum(pc.prod_cnt)/ceiling(sum(cast(if(days_since_prior_order='','0.0',days_since_prior_order) as float))/30) as avg_month_product_num
    from orders od join 
        (select order_id, count(*) as prod_cnt from priors group by order_id)pc 
      on od.order_id=pc.order_id
    group by od.user_id
    limit 20;
    

    第六题,每个用户最喜爱购买的三个商品

    select user_id, product_id, pro_cnt, row_num from
      (select user_id, product_id, pro_cnt, row_number() over(partition by user_id order by pro_cnt desc) as row_num 
       from 
         (select user_id,product_id,count(*) as pro_cnt
            from orders od join priors pr on od.order_id=pr.order_id 
            group by user_id,product_id)t
      )t1 
    where row_num<=3 
    limit 20;
    

    也可以再包一层,形成列表

    select user_id,collect_list(product_id) from
      (select user_id, product_id, pro_cnt, row_num from
         (select user_id, product_id, pro_cnt, row_number() over(partition by user_id order by pro_cnt desc) as row_num 
          from
            (select user_id,product_id,count(*) as pro_cnt 
               from orders od join priors pr on od.order_id=pr.order_id 
               group by user_id,product_id)t
         )t1 where row_num<=3
      )t2 
    group by user_id 
    limit 20;
    

    可惜,这个案例无法直接成功运行,报错

    Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
    FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask