1. 建表语句

1.1. 创建外部分区表并指定压缩

  1. drop table if exists dwd_base_event_log;
  2. create external table dwd_base_event_log(
  3. `mid_id` string,
  4. `user_id` string,
  5. ...,
  6. `extend1` string
  7. )
  8. partition by (dt string)
  9. stored as parquet
  10. location '/warehouse/gmall/dwd/dwd_base_event_log/';

1.2. 创建外部表并声明文件分隔符

drop table if exists dwd_base_event_log;
create external table location(
    id int,
  name string,
  loc string
)
row format delimited fields terminated by '\t';

2. 向表导入数据

2.1. 从HDFS导入数据到表中

load data inpath '/origin_data/gmall/log/topic_event/2020-01-01' 
into table gmall.ods_event_log partition(dt='2020-01-01');

2.2. 从本地导入数据到表中

load data local inpath '/opt/module/data/location'
into table location;

2.2. 静态分区,从其他表中拉取数据

(设置非严格模式)

set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table dwd_start_log
partition (dt = "2020-01-01")
select 
    get_json_object(line,'$.uid') mid_id,
  ...,
  get_json_object(line,'$.extend1') extend1
from ods_start_log
where dt='2020-01-01';

2.3. 动态分区,从其他表中拉取数据

(设置非严格模式、动态分区)

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;

insert overwrite table dwd_start_log
partition (dt)
select 
    get_json_object(line,'$.uid') mid_id,
  ...,
  get_json_object(line,'$.extend1') extend1
from ods_start_log
where dt='2020-01-01';

3. Join

3.1. join

见附录:表1

select stu.id,stu.name,stu.age,location.id,location.name,location.loc
from stu join location
on stu.id = location.id and stu.name=location.name;

结果如下:

stu.id stu.name stu.age location.id location.name location.loc
1 zhangsan 23 1 zhangsan nanjing
2 lisi 24 2 lisi beijing

3.2. left join

见附录:表1

select stu.id,stu.name,stu.age,score.id,score.score 
from stu left join score 
on stu.id = score.id;

结果如下:

stu.id stu.name stu.age score.id score.score
1 zhangsan 23 1 100
2 lisi 24 null null
3 wangwu 25 3 85

3.3. right join

见附录:表1

select stu.id,stu.name,stu.age,score.id,score.score 
from stu right join score 
on stu.id = score.id;

结果如下:

stu.id stu.name stu.age score.id score.score
1 zhangsan 23 1 100
3 wangwu 25 3 85
null null null 4 96

3.4. full join

见附录:表1

select stu.id,stu.name,stu.age,score.id,score.score
from stu full join score 
on stu.id = score.id;

结果如下:

stu.id stu.name stu.age score.id score.score
1 zhangsan 23 1 100
2 lisi 24 null null
3 wangwu 25 3 85
null null null 4 96

3.5. 多表连接

见附录:表1

select stu.id,stu.name,stu.age,score.id,score.score,location.id,location.name,location.loc
from stu join score
on stu.id = score.id
join location
on score.id = location.id;

结果如下:

stu.id stu.name stu.age score.id score.score location.id location.name location.loc
1 zhangsan 23 1 100 1 zhangsan nanjing

3.6. 产生笛卡尔积

见附录:表1

  1. 省略连接条件

    select stu.id,stu.name,stu.age,score.id,score.score
    from stu,score;
    
  2. 连接条件无效

  3. 所有表中的所有行互相连接

效果如下:

stu.id stu.name stu.age score.id score.score
1 zhangsan 23 1 100
1 zhangsan 23 4 96
1 zhangsan 23 3 85
2 lisi 24 1 100
2 lisi 24 4 96
2 lisi 24 3 85
3 wangwu 25 1 100
3 wangwu 25 4 96
3 wangwu 25 3 85

4. having 和 where

lesson表见附录:表2

create external table lesson(stu_id int,lesson_name string,grade int)
row format delimited fields terminated by '\t';

load data local inpath '/opt/module/data/lesson' into table lesson;

where后面不能用聚合函数,having后面可以

select stu_id, avg(grade) from lesson where avg(grade>80) group by stu_id;
select stu_id, avg(grade) from lesson group by stu_id having avg(grade)>80;

5. order/sort/distribute

案例使用附录:表2

5.1. order by

set mapreduce.job.reduces = 4;
select * from lesson order by grade;

结果如下:只产生一个reducer,并且全局有序

lesson.stu_id lesson.lesson_name lesson.grade
2 yingyu 60
2 shuxue 78
3 shuxue 87
1 yingyu 88
2 yuwen 89
1 shuxue 98
3 yuwen 99
1 yuwen 100
3 yingyu 100

5.2. sort by

set mapreduce.job.reduces = 4;
select * from lesson sort by grade;

结果如下:产生了4个reducer,并且4个分区内有序

lesson.stu_id lesson.lesson_name lesson.grade
3 shuxue 87
2 yuwen 89
2 yingyu 60
1 shuxue 98
2 shuxue 78
1 yingyu 88
1 yuwen 100
3 yuwen 99
3 yingyu 100

5.3. distribute by

set mapreduce.job.reduces = 2;
select * from lesson distribute by stu_id sort by grade;

结果如下:产生了2个reducer并且只划分了两个区

lesson.stu_id lesson.lesson_name lesson.grade
2 yingyu 60
2 shuxue 78
2 yuwen 89
3 shuxue 87
1 yingyu 88
1 shuxue 98
3 yuwen 99
1 yuwen 100
3 yingyu 100
set mapreduce.job.reduces = 3;
select * from lesson distribute by stu_id sort by grade;

结果如下:产生了3个reducer并且划分了三个

lesson.stu_id lesson.lesson_name lesson.grade
3 shuxue 87
3 yuwen 99
3 yingyu 100
1 yingyu 88
1 shuxue 98
1 yuwen 100
2 yingyu 60
2 shuxue 78
2 yuwen 89

6. 分桶表

6.1. 创建分桶表

见附录:表3

create table user_buck(id int , name string)
clustered by(id)
into 4 buckets
row format delimited fields terminated by '\t';

6.2. 向分桶表中导入数据

set hive.enforce.bucketing=true;
set mapreduce.job.reduces=-1;

create table if not exists `user`(id int , name string)
row format delimited fields terminated by '\t';

load data local inpath '/opt/module/data/user' into table `user`;

insert into user_buck
select id,name from `user`;

6.3. 抽样查看分桶表

select * from user_buck tablesample(bucket 1 out of 3 on id);

结果如下:

user_buck.id user_buck.name
9 I
6 F
3 C

7. 常用函数

7.1. 行转列

concat、concat_ws、collect_set

见附录:表4

要求:把星座和血型一样的人归类到一起。结果如下: 射手座,A 傻僧|龙哥 白羊座,A 孙悟空|猪八戒 白羊座,B 羊纠结

create table if not exsits person_info(
  name string,
  constellation string,
  blood_type string
  )
row format delimited fields terminated by "\t";

load data local inpath "/opt/module/data/constellation.txt" into table person_info;


select 
  t1.properties properties,
  concat_ws('|',collect_set(t1.name)) names
from
  (select name,concat(constellation,',',blood_type) properties
   from person_info
  ) t1
group by t1.properties;

结果如下:

properties names
射手座,A 傻僧|龙哥
白羊座,A 孙悟空|猪八戒
白羊座,B 羊纠结

7.2 列转行

explode、lateral view

见附录:表5

key values 字母 [“a”,”b”,”c”,”d”,”e”] 数字 [“1”,”2”,”3”,”4”,”5”]

要求: 把行炸开成 key value 字母 A 字母 B 字母 C 字母 D 字母 E 数字 1 数字 2 数字 3

数字 4

数字 5

create table if not exists collect(
  key string,
  values array<String>)
row format delimited fields terminated by "\t"
collection items terminated by ",";

load data local inpath "/opt/module/data/collect" into table collect;

select key, value
from collect lateral view explode(`values`) t1 as value;

结果如要求所示。

7.3. 日期函数

date_format(‘2020-01-01’,’yyyy-MM-dd’)
如果是 ‘2020/01/01’就不能直接使用date_format函数,要使用regexp_replace(‘2020/01/01’,’/‘,’-‘)将’/‘替换成’-‘。

datediff(‘2020-10-01’,’2020-01-04’)
date_add(‘2020-01-01’,100)
date_sub(‘2020-10-01’,100)
next_day(‘2020-02-01’,’Monday’) —— 2020年2月1日的下一个周一
last_day(‘2020-02-01’) —— 2020年2月的最后一天

8.窗口函数 over()

见附录:表6

要求: (1)查询在2017年4月份购买过的顾客及总人数

(2)查询顾客的购买明细及月购买总额

(3)上述的场景,要将cost按照日期进行累加

(4)查询顾客上次的购买时间

(5)查询前20%时间的订单信息

create table business (
  name string,
  orderdate string,
  cost int
  )
row format delimited fields terminated by ',';

load data local inpath '/opt/module/data/business' into table business;
  1. 查询在2017年4月份购买过的顾客及总人数
    select name,count(*) over() totalcustomer
    from business
    where substr(orderdate,1,7)='2017-04'
    group by name;
    
    结果如下:

    name totalcustomer jack 2 mart 2

  1. 查询顾客的购买明细及顾客的月购买总额
    select name,cost,orderdate,
    sum(cost) over(distribute by substr(orderdate,1,7),name) whos_total_cost
    from business;
    
    结果如下

    name cost orderdate whos_total_cost

jack 10 2017-01-01 111

jack 46 2017-01-05 111

jack 55 2017-01-08 111

tony 29 2017-01-04 94

tony 50 2017-01-07 94

tony 15 2017-01-02 94

jack 23 2017-02-03 23

jack 42 2017-04-06 42

mart 94 2017-04-13 299

mart 62 2017-04-08 299

mart 68 2017-04-09 299

mart 75 2017-04-11 299

neil 12 2017-05-10 12

neil 80 2017-06-12 80

  1. 将所有顾客的购买额按照日期进行累加

    select name,orderdate,cost,sum(cost) over(order by orderdate)
    from business
    order by orderdate;
    
  2. 查询顾客的月购买总额,将购买额cost按日期进行累加

    select name,
    sum(cost) over(distribute by substr(orderdate,1,7),name sort by orderdate) costadd,
    orderdate,whos_total_cost
    from(
    select name,cost,orderdate,
    sum(cost) over(distribute by substr(orderdate,1,7),name) whos_total_cost
    from business
    )t1;
    

9. rank()

RANK() 排序相同时会重复,总数不会变:1134
DENSE_RANK() 排序相同时会重复,总数会减少:1123
ROW_NUMBER() 会根据顺序计算:1234

需求:对于表6,求每个人消费最高的两条数据;

select  name,
        orderdate,
        cost
from(   
    select  name,
            orderdate,
            cost,
            row_number() over(partition by name order by cost desc) rank
    from business
)t1
where rank <= 2;

查询结果:

name orderdate cost

jack 2017-01-08 55

jack 2017-01-05 46

mart 2017-04-13 94

mart 2017-04-11 75

neil 2017-06-12 80

neil 2017-05-10 12

tony 2017-01-07 50

tony 2017-01-04 29

10. 分组求前N

见附录:表7
需求:求每个hisgroup中age最大的两个人的信息。

select 
id,
age,
hisgroup
from(
    select
    id,
    age,
    hisgroup,
    row_number() over(partition by hisgroup order by age desc) rank
    from id_age_group
)t1
where rank<=2;

功能语句

  1. 查看系统自带的函数

    hive> show functions;

  2. 显示自带函数的用法

    hive> desc function upper

  3. 详细显示自带函数的用法

    hive> desc function extended upper;


附录:表

表1:stu、score、location

表名:stu 表名:score 表名:location
id name age id score id name loc
1 zhangsan 23 1 100 1 zhangsan nanjing
2 lisi 24 3 85 2 lisi beijing
3 wangwu 25 4 96 4 zhaoliu dongjing

表2:lesson

表名:lesson
stu_id lesson_name grade
1 yuwen 100
1 shuxue 98
1 yingyu 88
2 yuwen 89
2 shuxue 78
2 yingyu 60
3 yuwen 99
3 shuxue 87
3 yingyu 100

表3:user、user_buck

表名:user/user_buck
id name
1 A
2 B
3 C
4 D
5 E
6 F
7 G
8 H
9 I

表4:person_info

表名:person_info
name constellation blood_type
孙悟空 白羊座 A
傻僧 射手座 A
猪八戒 白羊座 A
羊纠结 白羊座 B
龙哥 射手座 A

表5:collect

表名:collect
key values
数字 [“1”,”2”,”3”,”4”,”5”]
字母 [“a”,”b”,”c”,”d”,”e”]

表6:business

表名:business
name orderdate cost
jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94

表7:id_age_hisgroup

表名:id_age_hisgroup
id age hisgroup
1 100 b
2 933 b
3 23 c
4 32 a
5 231 c
6 22 c
7 12 b
8 1 a
9 200 a
10 302 a
11 98 b
12 71 c