1. 建表语句
1.1. 创建外部分区表并指定压缩
drop table if exists dwd_base_event_log;
create external table dwd_base_event_log(
`mid_id` string,
`user_id` string,
...,
`extend1` string
)
partition by (dt string)
stored as parquet
location '/warehouse/gmall/dwd/dwd_base_event_log/';
1.2. 创建外部表并声明文件分隔符
drop table if exists dwd_base_event_log;
create external table location(
id int,
name string,
loc string
)
row format delimited fields terminated by '\t';
2. 向表导入数据
2.1. 从HDFS导入数据到表中
load data inpath '/origin_data/gmall/log/topic_event/2020-01-01'
into table gmall.ods_event_log partition(dt='2020-01-01');
2.2. 从本地导入数据到表中
load data local inpath '/opt/module/data/location'
into table location;
2.2. 静态分区,从其他表中拉取数据
(设置非严格模式)
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_start_log
partition (dt = "2020-01-01")
select
get_json_object(line,'$.uid') mid_id,
...,
get_json_object(line,'$.extend1') extend1
from ods_start_log
where dt='2020-01-01';
2.3. 动态分区,从其他表中拉取数据
(设置非严格模式、动态分区)
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
insert overwrite table dwd_start_log
partition (dt)
select
get_json_object(line,'$.uid') mid_id,
...,
get_json_object(line,'$.extend1') extend1
from ods_start_log
where dt='2020-01-01';
3. Join
3.1. join
见附录:表1
select stu.id,stu.name,stu.age,location.id,location.name,location.loc
from stu join location
on stu.id = location.id and stu.name=location.name;
结果如下:
stu.id | stu.name | stu.age | location.id | location.name | location.loc |
---|---|---|---|---|---|
1 | zhangsan | 23 | 1 | zhangsan | nanjing |
2 | lisi | 24 | 2 | lisi | beijing |
3.2. left join
见附录:表1
select stu.id,stu.name,stu.age,score.id,score.score
from stu left join score
on stu.id = score.id;
结果如下:
stu.id | stu.name | stu.age | score.id | score.score |
---|---|---|---|---|
1 | zhangsan | 23 | 1 | 100 |
2 | lisi | 24 | null | null |
3 | wangwu | 25 | 3 | 85 |
3.3. right join
见附录:表1
select stu.id,stu.name,stu.age,score.id,score.score
from stu right join score
on stu.id = score.id;
结果如下:
stu.id | stu.name | stu.age | score.id | score.score |
---|---|---|---|---|
1 | zhangsan | 23 | 1 | 100 |
3 | wangwu | 25 | 3 | 85 |
null | null | null | 4 | 96 |
3.4. full join
见附录:表1
select stu.id,stu.name,stu.age,score.id,score.score
from stu full join score
on stu.id = score.id;
结果如下:
stu.id | stu.name | stu.age | score.id | score.score |
---|---|---|---|---|
1 | zhangsan | 23 | 1 | 100 |
2 | lisi | 24 | null | null |
3 | wangwu | 25 | 3 | 85 |
null | null | null | 4 | 96 |
3.5. 多表连接
见附录:表1
select stu.id,stu.name,stu.age,score.id,score.score,location.id,location.name,location.loc
from stu join score
on stu.id = score.id
join location
on score.id = location.id;
结果如下:
stu.id | stu.name | stu.age | score.id | score.score | location.id | location.name | location.loc |
---|---|---|---|---|---|---|---|
1 | zhangsan | 23 | 1 | 100 | 1 | zhangsan | nanjing |
3.6. 产生笛卡尔积
见附录:表1
省略连接条件
select stu.id,stu.name,stu.age,score.id,score.score from stu,score;
连接条件无效
- 所有表中的所有行互相连接
效果如下:
stu.id | stu.name | stu.age | score.id | score.score |
---|---|---|---|---|
1 | zhangsan | 23 | 1 | 100 |
1 | zhangsan | 23 | 4 | 96 |
1 | zhangsan | 23 | 3 | 85 |
2 | lisi | 24 | 1 | 100 |
2 | lisi | 24 | 4 | 96 |
2 | lisi | 24 | 3 | 85 |
3 | wangwu | 25 | 1 | 100 |
3 | wangwu | 25 | 4 | 96 |
3 | wangwu | 25 | 3 | 85 |
4. having 和 where
lesson表见附录:表2
create external table lesson(stu_id int,lesson_name string,grade int)
row format delimited fields terminated by '\t';
load data local inpath '/opt/module/data/lesson' into table lesson;
where后面不能用聚合函数,having后面可以
select stu_id, avg(grade) from lesson where avg(grade>80) group by stu_id;
select stu_id, avg(grade) from lesson group by stu_id having avg(grade)>80;
5. order/sort/distribute
5.1. order by
set mapreduce.job.reduces = 4;
select * from lesson order by grade;
结果如下:只产生一个reducer,并且全局有序
lesson.stu_id | lesson.lesson_name | lesson.grade |
---|---|---|
2 | yingyu | 60 |
2 | shuxue | 78 |
3 | shuxue | 87 |
1 | yingyu | 88 |
2 | yuwen | 89 |
1 | shuxue | 98 |
3 | yuwen | 99 |
1 | yuwen | 100 |
3 | yingyu | 100 |
5.2. sort by
set mapreduce.job.reduces = 4;
select * from lesson sort by grade;
结果如下:产生了4个reducer,并且4个分区内有序
lesson.stu_id | lesson.lesson_name | lesson.grade |
---|---|---|
3 | shuxue | 87 |
2 | yuwen | 89 |
2 | yingyu | 60 |
1 | shuxue | 98 |
2 | shuxue | 78 |
1 | yingyu | 88 |
1 | yuwen | 100 |
3 | yuwen | 99 |
3 | yingyu | 100 |
5.3. distribute by
set mapreduce.job.reduces = 2;
select * from lesson distribute by stu_id sort by grade;
结果如下:产生了2个reducer并且只划分了两个区
lesson.stu_id | lesson.lesson_name | lesson.grade |
---|---|---|
2 | yingyu | 60 |
2 | shuxue | 78 |
2 | yuwen | 89 |
3 | shuxue | 87 |
1 | yingyu | 88 |
1 | shuxue | 98 |
3 | yuwen | 99 |
1 | yuwen | 100 |
3 | yingyu | 100 |
set mapreduce.job.reduces = 3;
select * from lesson distribute by stu_id sort by grade;
结果如下:产生了3个reducer并且划分了三个
lesson.stu_id | lesson.lesson_name | lesson.grade |
---|---|---|
3 | shuxue | 87 |
3 | yuwen | 99 |
3 | yingyu | 100 |
1 | yingyu | 88 |
1 | shuxue | 98 |
1 | yuwen | 100 |
2 | yingyu | 60 |
2 | shuxue | 78 |
2 | yuwen | 89 |
6. 分桶表
6.1. 创建分桶表
见附录:表3
create table user_buck(id int , name string)
clustered by(id)
into 4 buckets
row format delimited fields terminated by '\t';
6.2. 向分桶表中导入数据
set hive.enforce.bucketing=true;
set mapreduce.job.reduces=-1;
create table if not exists `user`(id int , name string)
row format delimited fields terminated by '\t';
load data local inpath '/opt/module/data/user' into table `user`;
insert into user_buck
select id,name from `user`;
6.3. 抽样查看分桶表
select * from user_buck tablesample(bucket 1 out of 3 on id);
结果如下:
user_buck.id | user_buck.name |
---|---|
9 | I |
6 | F |
3 | C |
7. 常用函数
7.1. 行转列
concat、concat_ws、collect_set
见附录:表4
要求:把星座和血型一样的人归类到一起。结果如下: 射手座,A 傻僧|龙哥 白羊座,A 孙悟空|猪八戒 白羊座,B 羊纠结
create table if not exsits person_info(
name string,
constellation string,
blood_type string
)
row format delimited fields terminated by "\t";
load data local inpath "/opt/module/data/constellation.txt" into table person_info;
select
t1.properties properties,
concat_ws('|',collect_set(t1.name)) names
from
(select name,concat(constellation,',',blood_type) properties
from person_info
) t1
group by t1.properties;
结果如下:
properties | names |
---|---|
射手座,A | 傻僧|龙哥 |
白羊座,A | 孙悟空|猪八戒 |
白羊座,B | 羊纠结 |
7.2 列转行
explode、lateral view
见附录:表5
key values 字母 [“a”,”b”,”c”,”d”,”e”] 数字 [“1”,”2”,”3”,”4”,”5”]
要求: 把行炸开成 key value 字母 A 字母 B 字母 C 字母 D 字母 E 数字 1 数字 2 数字 3
数字 4
数字 5
create table if not exists collect(
key string,
values array<String>)
row format delimited fields terminated by "\t"
collection items terminated by ",";
load data local inpath "/opt/module/data/collect" into table collect;
select key, value
from collect lateral view explode(`values`) t1 as value;
结果如要求所示。
7.3. 日期函数
date_format(‘2020-01-01’,’yyyy-MM-dd’)
如果是 ‘2020/01/01’就不能直接使用date_format函数,要使用regexp_replace(‘2020/01/01’,’/‘,’-‘)将’/‘替换成’-‘。
datediff(‘2020-10-01’,’2020-01-04’)
date_add(‘2020-01-01’,100)
date_sub(‘2020-10-01’,100)
next_day(‘2020-02-01’,’Monday’) —— 2020年2月1日的下一个周一
last_day(‘2020-02-01’) —— 2020年2月的最后一天
8.窗口函数 over()
见附录:表6
要求: (1)查询在2017年4月份购买过的顾客及总人数
(2)查询顾客的购买明细及月购买总额
(3)上述的场景,要将cost按照日期进行累加
(4)查询顾客上次的购买时间
(5)查询前20%时间的订单信息
create table business (
name string,
orderdate string,
cost int
)
row format delimited fields terminated by ',';
load data local inpath '/opt/module/data/business' into table business;
- 查询在2017年4月份购买过的顾客及总人数
结果如下:select name,count(*) over() totalcustomer from business where substr(orderdate,1,7)='2017-04' group by name;
name totalcustomer jack 2 mart 2
- 查询顾客的购买明细及顾客的月购买总额
结果如下select name,cost,orderdate, sum(cost) over(distribute by substr(orderdate,1,7),name) whos_total_cost from business;
name cost orderdate whos_total_cost
jack 10 2017-01-01 111
jack 46 2017-01-05 111
jack 55 2017-01-08 111
tony 29 2017-01-04 94
tony 50 2017-01-07 94
tony 15 2017-01-02 94
jack 23 2017-02-03 23
jack 42 2017-04-06 42
mart 94 2017-04-13 299
mart 62 2017-04-08 299
mart 68 2017-04-09 299
mart 75 2017-04-11 299
neil 12 2017-05-10 12
neil 80 2017-06-12 80
将所有顾客的购买额按照日期进行累加
select name,orderdate,cost,sum(cost) over(order by orderdate) from business order by orderdate;
查询顾客的月购买总额,将购买额cost按日期进行累加
select name, sum(cost) over(distribute by substr(orderdate,1,7),name sort by orderdate) costadd, orderdate,whos_total_cost from( select name,cost,orderdate, sum(cost) over(distribute by substr(orderdate,1,7),name) whos_total_cost from business )t1;
9. rank()
RANK() 排序相同时会重复,总数不会变:1134
DENSE_RANK() 排序相同时会重复,总数会减少:1123
ROW_NUMBER() 会根据顺序计算:1234
需求:对于表6,求每个人消费最高的两条数据;
select name,
orderdate,
cost
from(
select name,
orderdate,
cost,
row_number() over(partition by name order by cost desc) rank
from business
)t1
where rank <= 2;
查询结果:
name orderdate cost
jack 2017-01-08 55
jack 2017-01-05 46
mart 2017-04-13 94
mart 2017-04-11 75
neil 2017-06-12 80
neil 2017-05-10 12
tony 2017-01-07 50
tony 2017-01-04 29
10. 分组求前N
见附录:表7
需求:求每个hisgroup中age最大的两个人的信息。
select
id,
age,
hisgroup
from(
select
id,
age,
hisgroup,
row_number() over(partition by hisgroup order by age desc) rank
from id_age_group
)t1
where rank<=2;
功能语句
查看系统自带的函数
hive> show functions;
显示自带函数的用法
hive> desc function upper
详细显示自带函数的用法
hive> desc function extended upper;
附录:表
表1:stu、score、location
表名:stu | 表名:score | 表名:location | |||||||
---|---|---|---|---|---|---|---|---|---|
id | name | age | id | score | id | name | loc | ||
1 | zhangsan | 23 | 1 | 100 | 1 | zhangsan | nanjing | ||
2 | lisi | 24 | 3 | 85 | 2 | lisi | beijing | ||
3 | wangwu | 25 | 4 | 96 | 4 | zhaoliu | dongjing |
表2:lesson
表名:lesson | ||
---|---|---|
stu_id | lesson_name | grade |
1 | yuwen | 100 |
1 | shuxue | 98 |
1 | yingyu | 88 |
2 | yuwen | 89 |
2 | shuxue | 78 |
2 | yingyu | 60 |
3 | yuwen | 99 |
3 | shuxue | 87 |
3 | yingyu | 100 |
表3:user、user_buck
表名:user/user_buck | |
---|---|
id | name |
1 | A |
2 | B |
3 | C |
4 | D |
5 | E |
6 | F |
7 | G |
8 | H |
9 | I |
表4:person_info
表名:person_info | ||
---|---|---|
name | constellation | blood_type |
孙悟空 | 白羊座 | A |
傻僧 | 射手座 | A |
猪八戒 | 白羊座 | A |
羊纠结 | 白羊座 | B |
龙哥 | 射手座 | A |
表5:collect
表名:collect | |
---|---|
key | values |
数字 | [“1”,”2”,”3”,”4”,”5”] |
字母 | [“a”,”b”,”c”,”d”,”e”] |
表6:business
表名:business | ||
---|---|---|
name | orderdate | cost |
jack,2017-01-01,10 tony,2017-01-02,15 jack,2017-02-03,23 tony,2017-01-04,29 jack,2017-01-05,46 jack,2017-04-06,42 tony,2017-01-07,50 jack,2017-01-08,55 mart,2017-04-08,62 mart,2017-04-09,68 neil,2017-05-10,12 mart,2017-04-11,75 neil,2017-06-12,80 mart,2017-04-13,94 |
表7:id_age_hisgroup
表名:id_age_hisgroup | ||
---|---|---|
id | age | hisgroup |
1 100 b 2 933 b 3 23 c 4 32 a 5 231 c 6 22 c 7 12 b 8 1 a 9 200 a 10 302 a 11 98 b 12 71 c |