乱七八糟
round(double a, int d) 取整,保留小数点后 d 位,四舍五入
floor 向下取整
ceil 向上取整
rand(int seed) 取随机数
### 拼接
concat(string A, string B…) concat('abc','def','gh') >>> abcdefgh
concat_ws(string SEP, string A, string B…) concat_ws(',','abc','def','gh') >>> abc,def,gh
### 截取
substr('abcde',3) >>> cde
substr('abcde',-1) >>> e
substr('abcde',3,2) >>> cd
substr('abcde',-2,2) >>> de
cube、rollup
*rollup从右往左进行上卷
*cube是任意维度上卷
-- CUBE: aggregate over every combination of the grouping columns
-- (部门+员工, 部门 only, 员工 only, and the grand total).
SELECT
    部门,
    员工,
    SUM(工资) AS TOTAL
FROM DEPART
GROUP BY 部门, 员工
WITH CUBE
===就是groupby部门,员工 unionall groupby部门 unionall groupby员工 unionall 不分组的全表汇总
-- ROLLUP: roll up from right to left over the grouping columns:
-- (A,B,C) -> (A,B) -> (A) -> grand total.
SELECT
    A,
    B,
    C,
    SUM(工资) AS TOTAL
FROM DEPART
GROUP BY A, B, C
WITH rollup
===就是groupbyA,B,C unionall groupbyA,B unionall groupbyA unionall 不分组的全表汇总
时间操作
时间戳 >>> 字符串
from_unixtime(1323308943,'yyyy-MM-dd HH:mm:ss') >>> 2011-12-08 12:12:12
字符串 >>> 时间戳
unix_timestamp() 当前时间戳
unix_timestamp('2011-12-07 13:01:03') 必须是yyyy-MM-dd HH:mm:ss格式的字符串
unix_timestamp('20111207 13:01:03','yyyyMMdd HH:mm:ss') 可以自定义格式(格式串必须与字符串逐字符对应,包括空格)
year('2011-12-08 10:03:01') 2011
month('2011-12-08 10:03:01') 12
day('2011-12-08 10:03:01') 8
hour('2011-12-08 10:03:01') 10
minute('2011-12-08 10:03:01') 3
second('2011-12-08 10:03:01') 1
# add_months函数
select add_months('2020-08-05', -1);
# date_format
select date_format('2020-06-14','yyyy-MM'); >>>2020-06
# 第一个参数减第二个参数
datediff(string enddate, string startdate)
date_add(string startdate, int days)
date_sub (string startdate, int days)
# 当前时间
SELECT CURRENT_DATE;>>> 2017-06-15
select unix_timestamp(); >>> 1562717890
SELECT CURRENT_TIMESTAMP;>>> 2017-06-15 19:54:44
SELECT from_unixtime(unix_timestamp(),'yyyy-MM-dd HH:mm:ss'); >>> 2017-06-15 19:55:04
**********************************************************
SELECT extract(year FROM '2020-08-05 09:30:08'); -- 结果为 2020
SELECT extract(quarter FROM '2020-08-05 09:30:08'); -- 结果为 3
SELECT extract(month FROM '2020-08-05 09:30:08'); -- 结果为 8
SELECT extract(week FROM '2020-08-05 09:30:08'); -- 结果为 31,一年中的第几周
SELECT extract(day FROM '2020-08-05 09:30:08'); -- 结果为 5
SELECT extract(hour FROM '2020-08-05 09:30:08'); -- 结果为 9
SELECT extract(minute FROM '2020-08-05 09:30:08'); -- 结果为 30
SELECT extract(second FROM '2020-08-05 09:30:08'); -- 结果为 8
************************************************************************周操作
-- 2020-08-05为周三
SELECT next_day('2020-08-05','MO') -- 下一个周一对应的日期:2020-08-10
SELECT next_day('2020-08-05','TU') -- 下一个周二对应的日期:2020-08-11
SELECT next_day('2020-08-05','WE') -- 下一个周三对应的日期:2020-08-12
SELECT next_day('2020-08-05','TH') -- 下一个周四对应的日期:2020-08-06,即为本周四
SELECT next_day('2020-08-05','FR') -- 下一个周五对应的日期:2020-08-07,即为本周五
SELECT next_day('2020-08-05','SA') -- 下一个周六对应的日期:2020-08-08,即为本周六
SELECT next_day('2020-08-05','SU') -- 下一个周日对应的日期:2020-08-09,即为本周日
#当前周的周一
SELECT date_add(next_day('2020-08-05','MO'),-7);
#当前周的周日
select date_add(next_day('2020-08-05','MO'),-1);
************************************************************************月操作
#当前月的最后一天
SELECT last_day('2020-08-05'); -- 2020-08-31
#当前月的第一天
-- First day of the month (fixed: the original mixed " with a curly quote ’, a syntax error).
select trunc('2020-08-05','MM'); -- 2020-08-01
正则
### 将字符串subject按照pattern正则表达式的规则拆分,返回index指定的字符
regexp_extract(string subject, string pattern, int index)
select regexp_extract('["4873748","666"]','([0-9]+)(.*?)([0-9]+)',0); 4873748","666
select regexp_extract('["4873748","666"]','([0-9]+)(.*?)([0-9]+)',1); 4873748
select regexp_extract('["4873748","666"]','([0-9]+)(.*?)([0-9]+)',2); ","
select regexp_extract('["4873748","666"]','([0-9]+)(.*?)([0-9]+)',3); 666
select regexp_extract('我爱你123zhongguo666!','([0-9]+)',0) 123
select regexp_extract('我爱你123zhongguo666!','(\\D+)',0) 我爱你
### 将字符串A中的符合java正则表达式B的部分替换为C
regexp_replace(string A, string B, string C)
regexp_replace('foobar', 'oo|ar', '') 》》》fb
select regexp_replace('我爱你123zhongguo666!','([0-9]+)','') 爱你zhongguo!
select regexp_replace('我爱你123zhongguo666!','(\\D+)','') 123666
### regexp
select * from coupon_info where tactics_type regexp '[0-9]{1,}' limit 10
4个by
- spark-sql中因为shuffle分区共500个 导致小文件有500个。此时可以使用distribute by减少小文件。
distribute by 1:即使动态分区,每个分区也只有一个小文件。
1)Order By:全局排序,只有一个Reducer;
2)Sort By:分区内有序;
3)distribute By:类似MR中Partition,进行分区,结合sort by使用。
4)Cluster By:当Distribute by和Sorts by字段相同时,可以使用Cluster by方式。Cluster by除了具有Distribute by的功能外还兼具Sort by的功能。但是排序【只能】是升序排序,不能指定排序规则为ASC或者DESC。
在生产环境中Order By用的比较少,容易导致OOM。
在生产环境中Sort By + Distribute By用的多
*******************************************
#先按照部门编号分区,再按照员工编号降序排序
#要求DISTRIBUTE BY语句要写在SORT BY语句之前
-- Partition rows by deptno, then sort within each partition by empno descending.
-- DISTRIBUTE BY must be written before SORT BY.
select * from emp
distribute by deptno
sort by empno desc;
#当distribute by和sorts by字段相同时,可以使用cluster by方式,但是排序方式只能是asc 不能指定desc。
select * from emp distribute by deptno sort by deptno;
等价于
select * from emp cluster by deptno;
列转行
20141018 aa|bb 7|9|0|3
20141019 cc|dd 6|1|8|5
20141019 cc
20141019 dd
使用方式:table_1、table_2 随意起
-- Explode the pipe-delimited columns into one row per element.
-- split() takes a Java regex, so a literal '|' must be escaped as '\\|'
-- (the original "//|" matched "//" or the empty string — wrong delimiter).
-- 'table' is a reserved word in Hive, so the base alias is renamed to t.
select t.datenu,
       t1.des,
       t2.type
from tb_split as t
lateral view explode(split(t.des, '\\|')) t1 as des
lateral view explode(split(t.type, '\\|')) t2 as type;
行转列
a b 1
a b 2
a b 3
c d 4
c d 5
c d 6
a b 1,2,3
c d 4,5,6
-- Collapse the col3 values of each (col1, col2) group into one
-- comma-separated string; collect_set de-duplicates before joining.
SELECT
    col1,
    col2,
    concat_ws(',', collect_set(col3))
FROM tmp_jiangzl_test
GROUP BY col1, col2;
collect_list 不去重,collect_set 去重,他们都返回array
上面concat_ws转不过来可以用空格(' ')替代逗号(',')
########### group by 把string转array
hive (gmall)> select * from stud;
name area course score
zhang3 bj math 88
li4 bj math 99
wang5 sh chinese 92
zhao6 sh chinese 54
tian7 bj chinese 91
#聚合成一个集合
hive (gmall)> select course,
collect_set(area),
avg(score)
from stud
group by course;
chinese ["sh","bj"] 79.0
math ["bj"] 93.5
#用下标取第一个
hive (gmall)> select course,
collect_set(area)[0],
avg(score)
from stud
group by course;
chinese sh 79.0
math bj 93.5
array
-- Demo table for the array<INT> collection type.
create table class_test(
name string,
student_id_list array<INT>
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' -- top-level fields split on ','
COLLECTION ITEMS TERMINATED BY ':'; -- array elements split on ':'
#数据
034,1:2:3:4
035,5:6
036,7:8:9:10
#函数
student_id_list[0]
where array_contains(student_id_list, 5) 》》》035 字段包含5的一条(student_id_list是array&lt;INT&gt;,参数要用整数5,不能用字符串'5')
size()
#array 一行转多行
-- One row per array element: explode student_id_list via LATERAL VIEW.
-- 'table' is a reserved word in Hive, so the base alias is renamed to c.
select c.name,
       ids.list_item
from class_test as c
LATERAL VIEW explode(c.student_id_list) ids as list_item;
map
-- Demo table for the map<string,int> collection type.
create table employee(
id string,
perf map<string, int>
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ',' -- map entries split on ','
MAP KEYS TERMINATED BY ':'; -- key and value within an entry split on ':'
#数据
1 job:80,team:60,person:70
2 job:60,team:80
3 job:90,team:70,person:100
#函数
perf['person']》》》70
size(perf)》》》3
map_keys(perf)》》》["job","team","person"]
map_values(perf)》》》[80,60,70] (值是int,不带引号)
#解析map
{reason1:[activity_id1,activity_id2,activity_id3],
reason2:[activity_id1,activity_id2,activity_id3],}
-- Explode a map column: each map entry becomes one row of (key, value).
with
table_tmp as (
select
req_id,
filtered --{"FILTERED_BY_CUSTOM_DMP":[48643,47483],"CPM_LT_BIDFLOOR":[48921]}
from ad.ad_filter_stat
-- hourly partition range 2020-12-15 00 .. 23, both ends inclusive
where data_date>=2020121500 and data_date<=2020121523)
select
reason,
activity_ids
from table_tmp
-- explode(map) yields two output columns: the key (reason) and the value (activity_ids)
lateral view explode(filtered) table_lv as reason,activity_ids;
>>>
reason1:[activity_id1,activity_id2,activity_id3]
reason2:[activity_id1,activity_id2,activity_id3]
Struct
(Struct就是map不存储key)
-- Demo table for the struct type (fixed fields accessed as info.name / info.age).
create table student_test(
id INT,
info struct<name:STRING, age:INT>
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' -- fixed: closing quote was a curly quote ’ (syntax error)
COLLECTION ITEMS TERMINATED BY ':'; -- struct members split on ':'
#数据
1,zhou:30
2,yan:30
3,chen:20
4,li:80
#函数
info.age
json存储解析
********************************************
1、直接存string。用get_json_object函数
-- Option 1: store raw JSON as a single string column; parse at query time
-- with get_json_object (JSONPath-style '$.field' accessor).
create external table marvin_test_join
(
json string
)
STORED AS textfile ;
-- NOTE(review): 'day' is not a declared column above — presumably a partition
-- column of marvin_test_join; confirm against the real DDL.
select
count(day)
from marvin_test_join
where day = 20181220
and get_json_object(json,'$.network') = 'Mobupps_Install';
*******************************************
2、自动解析:load的是json,查的是字段
add jar hdfs:/jar-file/json-serde-1.3.7-jar-with-dependencies.jar;
-- Option 2: a JSON SerDe parses each raw JSON line into typed columns at read
-- time (requires the json-serde jar to be added before DDL and before queries).
CREATE TABLE blued_overseas.sd_cost_google_doc
(
country string,
agency string,
network_name string,
os_name string,
cost string,
day string
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION '/blued/blued_overseas/sd_cost_google_doc';
-- Load the raw JSON file; JSON keys are mapped to the columns above by the SerDe.
load data local inpath '/data/PRODUCTION_DATA/google_doc/cost.json' overwrite into table blued_overseas.sd_cost_google_doc;
#查询要先添加jar
add jar hdfs:/jar-file/json-serde-1.3.7-jar-with-dependencies.jar;
select country from blued_overseas.sd_cost_google_doc;
