1. 建表语句
- 1.1. 创建外部分区表并指定压缩
- 1.2. 创建外部表并声明文件分隔符
2. 向表导入数据
3. Join
4. having 和 where
5. order/sort/distribute
6. 分桶表
7. 常用函数
8.窗口函数 over()
9. rank()
10. 分组求前N
功能语句
附录：表

1. 建表语句

1.1. 创建外部分区表并指定压缩

drop table if exists dwd_base_event_log;
create external table dwd_base_event_log(
  `mid_id` string,
  `user_id` string,
  ...,
  `extend1` string
)
partition by (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_base_event_log/';

1.2. 创建外部表并声明文件分隔符

drop table if exists dwd_base_event_log;
create external table location(
    id int,
  name string,
  loc string
)
row format delimited fields terminated by '\t';

2. 向表导入数据

2.1. 从HDFS导入数据到表中

load data inpath '/origin_data/gmall/log/topic_event/2020-01-01' 
into table gmall.ods_event_log partition(dt='2020-01-01');

2.2. 从本地导入数据到表中

load data local inpath '/opt/module/data/location'
into table location;

2.2. 静态分区，从其他表中拉取数据

(设置非严格模式)

set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table dwd_start_log
partition (dt = "2020-01-01")
select 
    get_json_object(line,'$.uid') mid_id,
  ...,
  get_json_object(line,'$.extend1') extend1
from ods_start_log
where dt='2020-01-01';

2.3. 动态分区，从其他表中拉取数据

(设置非严格模式、动态分区)

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;

insert overwrite table dwd_start_log
partition (dt)
select 
    get_json_object(line,'$.uid') mid_id,
  ...,
  get_json_object(line,'$.extend1') extend1
from ods_start_log
where dt='2020-01-01';

3. Join

3.1. join

见附录：表1

select stu.id,stu.name,stu.age,location.id,location.name,location.loc
from stu join location
on stu.id = location.id and stu.name=location.name;

结果如下：

stu.id	stu.name	stu.age	location.id	location.name	location.loc
1	zhangsan	23	1	zhangsan	nanjing
2	lisi	24	2	lisi	beijing

3.2. left join

见附录：表1

select stu.id,stu.name,stu.age,score.id,score.score 
from stu left join score 
on stu.id = score.id;

结果如下：

stu.id	stu.name	stu.age	score.id	score.score
1	zhangsan	23	1	100
2	lisi	24	null	null
3	wangwu	25	3	85

3.3. right join

见附录：表1

select stu.id,stu.name,stu.age,score.id,score.score 
from stu right join score 
on stu.id = score.id;

结果如下：

stu.id	stu.name	stu.age	score.id	score.score
1	zhangsan	23	1	100
3	wangwu	25	3	85
null	null	null	4	96

3.4. full join

见附录：表1

select stu.id,stu.name,stu.age,score.id,score.score
from stu full join score 
on stu.id = score.id;

结果如下：

stu.id	stu.name	stu.age	score.id	score.score
1	zhangsan	23	1	100
2	lisi	24	null	null
3	wangwu	25	3	85
null	null	null	4	96

3.5. 多表连接

见附录：表1

select stu.id,stu.name,stu.age,score.id,score.score,location.id,location.name,location.loc
from stu join score
on stu.id = score.id
join location
on score.id = location.id;

结果如下：

stu.id	stu.name	stu.age	score.id	score.score	location.id	location.name	location.loc
1	zhangsan	23	1	100	1	zhangsan	nanjing

3.6. 产生笛卡尔积

见附录：表1

省略连接条件

select stu.id,stu.name,stu.age,score.id,score.score
from stu,score;

连接条件无效
所有表中的所有行互相连接

效果如下：

stu.id	stu.name	stu.age	score.id	score.score
1	zhangsan	23	1	100
1	zhangsan	23	4	96
1	zhangsan	23	3	85
2	lisi	24	1	100
2	lisi	24	4	96
2	lisi	24	3	85
3	wangwu	25	1	100
3	wangwu	25	4	96
3	wangwu	25	3	85

4. having 和 where

lesson表见附录：表2

create external table lesson(stu_id int,lesson_name string,grade int)
row format delimited fields terminated by '\t';

load data local inpath '/opt/module/data/lesson' into table lesson;

where后面不能用聚合函数，having后面可以

select stu_id, avg(grade) from lesson where avg(grade>80) group by stu_id;
select stu_id, avg(grade) from lesson group by stu_id having avg(grade)>80;

5. order/sort/distribute

案例使用附录：表2

5.1. order by

set mapreduce.job.reduces = 4;
select * from lesson order by grade;

结果如下：只产生一个reducer，并且全局有序

lesson.stu_id	lesson.lesson_name	lesson.grade
2	yingyu	60
2	shuxue	78
3	shuxue	87
1	yingyu	88
2	yuwen	89
1	shuxue	98
3	yuwen	99
1	yuwen	100
3	yingyu	100

5.2. sort by

set mapreduce.job.reduces = 4;
select * from lesson sort by grade;

结果如下：产生了4个reducer，并且4个分区内有序

lesson.stu_id	lesson.lesson_name	lesson.grade
3	shuxue	87
2	yuwen	89
2	yingyu	60
1	shuxue	98
2	shuxue	78
1	yingyu	88
1	yuwen	100
3	yuwen	99
3	yingyu	100

5.3. distribute by

set mapreduce.job.reduces = 2;
select * from lesson distribute by stu_id sort by grade;

结果如下：产生了2个reducer并且只划分了两个区

lesson.stu_id	lesson.lesson_name	lesson.grade
2	yingyu	60
2	shuxue	78
2	yuwen	89
3	shuxue	87
1	yingyu	88
1	shuxue	98
3	yuwen	99
1	yuwen	100
3	yingyu	100

set mapreduce.job.reduces = 3;
select * from lesson distribute by stu_id sort by grade;

结果如下：产生了3个reducer并且划分了三个

lesson.stu_id	lesson.lesson_name	lesson.grade
3	shuxue	87
3	yuwen	99
3	yingyu	100
1	yingyu	88
1	shuxue	98
1	yuwen	100
2	yingyu	60
2	shuxue	78
2	yuwen	89

6. 分桶表

6.1. 创建分桶表

见附录：表3

create table user_buck(id int , name string)
clustered by(id)
into 4 buckets
row format delimited fields terminated by '\t';

6.2. 向分桶表中导入数据

set hive.enforce.bucketing=true;
set mapreduce.job.reduces=-1;

create table if not exists `user`(id int , name string)
row format delimited fields terminated by '\t';

load data local inpath '/opt/module/data/user' into table `user`;

insert into user_buck
select id,name from `user`;

6.3. 抽样查看分桶表

select * from user_buck tablesample(bucket 1 out of 3 on id);

结果如下：

user_buck.id	user_buck.name
9	I
6	F
3	C

7. 常用函数

7.1. 行转列

concat、concat_ws、collect_set

见附录：表4

要求：把星座和血型一样的人归类到一起。结果如下：射手座,A 傻僧|龙哥白羊座,A 孙悟空|猪八戒白羊座,B 羊纠结

create table if not exsits person_info(
  name string,
  constellation string,
  blood_type string
  )
row format delimited fields terminated by "\t";

load data local inpath "/opt/module/data/constellation.txt" into table person_info;


select 
  t1.properties properties,
  concat_ws('|',collect_set(t1.name)) names
from
  (select name,concat(constellation,',',blood_type) properties
   from person_info
  ) t1
group by t1.properties;

结果如下：

properties	names
射手座,A	傻僧\|龙哥
白羊座,A	孙悟空\|猪八戒
白羊座,B	羊纠结

7.2 列转行

explode、lateral view

见附录：表5

key values 字母 [“a”,”b”,”c”,”d”,”e”] 数字 [“1”,”2”,”3”,”4”,”5”]

要求：把行炸开成 key value 字母 A 字母 B 字母 C 字母 D 字母 E 数字 1 数字 2 数字 3

数字 4

数字 5

create table if not exists collect(
  key string,
  values array<String>)
row format delimited fields terminated by "\t"
collection items terminated by ",";

load data local inpath "/opt/module/data/collect" into table collect;

select key, value
from collect lateral view explode(`values`) t1 as value;

结果如要求所示。

7.3. 日期函数

date_format(‘2020-01-01’,’yyyy-MM-dd’)
如果是 ‘2020/01/01’就不能直接使用date_format函数，要使用regexp_replace(‘2020/01/01’,’/‘,’-‘)将’/‘替换成’-‘。

datediff(‘2020-10-01’,’2020-01-04’)
date_add(‘2020-01-01’,100)
date_sub(‘2020-10-01’,100)
next_day(‘2020-02-01’,’Monday’) —— 2020年2月1日的下一个周一
last_day(‘2020-02-01’) —— 2020年2月的最后一天

8.窗口函数 over()

见附录：表6

要求：（1）查询在2017年4月份购买过的顾客及总人数

（2）查询顾客的购买明细及月购买总额

（3）上述的场景,要将cost按照日期进行累加

（4）查询顾客上次的购买时间

（5）查询前20%时间的订单信息

create table business (
  name string,
  orderdate string,
  cost int
  )
row format delimited fields terminated by ',';

load data local inpath '/opt/module/data/business' into table business;

查询在2017年4月份购买过的顾客及总人数
```
select name,count(*) over() totalcustomer
from business
where substr(orderdate,1,7)='2017-04'
group by name;
```
结果如下：

name totalcustomer jack 2 mart 2

查询顾客的购买明细及顾客的月购买总额

select name,cost,orderdate,
sum(cost) over(distribute by substr(orderdate,1,7),name) whos_total_cost
from business;

结果如下

name cost orderdate whos_total_cost

jack 10 2017-01-01 111

jack 46 2017-01-05 111

jack 55 2017-01-08 111

tony 29 2017-01-04 94

tony 50 2017-01-07 94

tony 15 2017-01-02 94

jack 23 2017-02-03 23

jack 42 2017-04-06 42

mart 94 2017-04-13 299

mart 62 2017-04-08 299

mart 68 2017-04-09 299

mart 75 2017-04-11 299

neil 12 2017-05-10 12

neil 80 2017-06-12 80

将所有顾客的购买额按照日期进行累加

select name,orderdate,cost,sum(cost) over(order by orderdate)
from business
order by orderdate;

查询顾客的月购买总额，将购买额cost按日期进行累加

select name,
sum(cost) over(distribute by substr(orderdate,1,7),name sort by orderdate) costadd,
orderdate,whos_total_cost
from(
select name,cost,orderdate,
sum(cost) over(distribute by substr(orderdate,1,7),name) whos_total_cost
from business
)t1;

9. rank()

RANK() 排序相同时会重复，总数不会变：1134
DENSE_RANK() 排序相同时会重复，总数会减少：1123
ROW_NUMBER() 会根据顺序计算：1234

需求：对于表6，求每个人消费最高的两条数据；

select  name,
        orderdate,
        cost
from(   
    select  name,
            orderdate,
            cost,
            row_number() over(partition by name order by cost desc) rank
    from business
)t1
where rank <= 2;

查询结果：

name orderdate cost

jack 2017-01-08 55

jack 2017-01-05 46

mart 2017-04-13 94

mart 2017-04-11 75

neil 2017-06-12 80

neil 2017-05-10 12

tony 2017-01-07 50

tony 2017-01-04 29

10. 分组求前N

见附录：表7
需求：求每个hisgroup中age最大的两个人的信息。

select 
id,
age,
hisgroup
from(
    select
    id,
    age,
    hisgroup,
    row_number() over(partition by hisgroup order by age desc) rank
    from id_age_group
)t1
where rank<=2;

功能语句

查看系统自带的函数

hive> show functions;
显示自带函数的用法

hive> desc function upper
详细显示自带函数的用法

hive> desc function extended upper;

附录：表

表1：stu、score、location

表名：stu			表名：score		表名：location
id	name	age	id	score	id	name	loc
1	zhangsan	23	1	100	1	zhangsan	nanjing
2	lisi	24	3	85	2	lisi	beijing
3	wangwu	25	4	96	4	zhaoliu	dongjing

表2：lesson

表名：lesson
stu_id	lesson_name	grade
1	yuwen	100
1	shuxue	98
1	yingyu	88
2	yuwen	89
2	shuxue	78
2	yingyu	60
3	yuwen	99
3	shuxue	87
3	yingyu	100

表3：user、user_buck

表名：user/user_buck
id	name
1	A
2	B
3	C
4	D
5	E
6	F
7	G
8	H
9	I

表4：person_info

表名：person_info
name	constellation	blood_type
孙悟空	白羊座	A
傻僧	射手座	A
猪八戒	白羊座	A
羊纠结	白羊座	B
龙哥	射手座	A

表5：collect

表名：collect
key	values
数字	[“1”,”2”,”3”,”4”,”5”]
字母	[“a”,”b”,”c”,”d”,”e”]

表6：business

表名：business
name	orderdate	cost
jack,2017-01-01,10 tony,2017-01-02,15 jack,2017-02-03,23 tony,2017-01-04,29 jack,2017-01-05,46 jack,2017-04-06,42 tony,2017-01-07,50 jack,2017-01-08,55 mart,2017-04-08,62 mart,2017-04-09,68 neil,2017-05-10,12 mart,2017-04-11,75 neil,2017-06-12,80 mart,2017-04-13,94

表7：id_age_hisgroup

表名：id_age_hisgroup
id	age	hisgroup
1 100 b 2 933 b 3 23 c 4 32 a 5 231 c 6 22 c 7 12 b 8 1 a 9 200 a 10 302 a 11 98 b 12 71 c

Hive语句示例

1. 建表语句

1.1. 创建外部分区表并指定压缩

1.2. 创建外部表并声明文件分隔符

2. 向表导入数据

2.1. 从HDFS导入数据到表中

2.2. 从本地导入数据到表中

2.2. 静态分区，从其他表中拉取数据

2.3. 动态分区，从其他表中拉取数据

3. Join

3.1. join

3.2. left join

3.3. right join

3.4. full join

3.5. 多表连接

3.6. 产生笛卡尔积

4. having 和 where

5. order/sort/distribute

5.1. order by

5.2. sort by

5.3. distribute by

6. 分桶表

6.1. 创建分桶表

6.2. 向分桶表中导入数据

6.3. 抽样查看分桶表

7. 常用函数

7.1. 行转列

concat、concat_ws、collect_set

7.2 列转行

explode、lateral view

7.3. 日期函数

8.窗口函数 over()

9. rank()

10. 分组求前N

功能语句

附录：表

表1：stu、score、location

表2：lesson

表3：user、user_buck

表4：person_info

表5：collect

表6：business

表7：id_age_hisgroup