— 查询测试表t1(无重复数据)
select * from (select point_time,tagname,value_double,year,mounth,day,row_number()over(distribute by point_time,tagname,value_double,year,mounth,day sort by point_time) as rn from scada_history.t1) as t where t.rn>1;
insert overwrite table scada_history.t4 select t.id,t.name,t.age from (select *,row_number()over(distribute by id,name,age sort by id) as rn from scada_history.t4) as t where t.rn=1 and id='1';
— t1表有958026条不重复数据数据
select count(1) from t1;
— 多次(3次)导入t1表数据到t3表(表结构相同)
insert into table t3 partition(year,mounth,day) select * from t1;
insert into table t3 partition(year,mounth,day) select * from t1;
insert into table t3 partition(year,mounth,day) select * from t1;
— t1、t3建表语句
CREATE TABLE `scada_history.t1`( --CREATE TABLE `scada_history.t3`(
`point_time` string,
`tagname` string,
`value_double` double)
PARTITIONED BY (
`year` string,
`mounth` string,
`day` string)
ROW FORMAT SERDE
— 查询t3表有重复数据
select point_time,tagname,value_double,year,mounth,day,row_number()over(distribute by point_time,tagname,value_double,year,mounth,day sort by point_time) from scada_history.t3 where year="2014" and mounth="05" and day="14" limit 12;
--------------------------------------------------------------------------------------------
OK
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 00:29:06 63.529999 2014 05 14 1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 00:29:06 63.529999 2014 05 14 2
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 00:29:06 63.529999 2014 05 14 3
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 01:29:25 63.529999 2014 05 14 1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 01:29:25 63.529999 2014 05 14 2
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 01:29:25 63.529999 2014 05 14 3
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 02:29:45 63.529999 2014 05 14 1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 02:29:45 63.529999 2014 05 14 2
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 02:29:45 63.529999 2014 05 14 3
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 03:29:54 63.529999 2014 05 14 1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 03:29:54 63.529999 2014 05 14 2
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 03:29:54 63.529999 2014 05 14 3
Time taken: 26.894 seconds, Fetched: 12 row(s)
— count t3表
select count(1) from t3;
--------------------------------------------------------------------------------------------
Total MapReduce CPU Time Spent: 8 seconds 320 msec
OK
2874078 -- 2874078/3=958026
Time taken: 38.787 seconds, Fetched: 1 row(s)
— 执行表内去重
insert overwrite table scada_history.t3 partition(year,mounth,day) select t.point_time,t.tagname,t.value_double,t.year,t.mounth,t.day from (select point_time,tagname,value_double,year,mounth,day,row_number()over(distribute by point_time,tagname,value_double,year,mounth,day sort by point_time) as rn from scada_history.t3) as t where t.rn=1;
— 执行计数
select count(1) from t3;
--------------------------------------------------------------------------------------------
Total MapReduce CPU Time Spent: 6 seconds 740 msec
OK
958026
Time taken: 35.925 seconds, Fetched: 1 row(s)
— 执行查询t3表没有重复数据
select point_time,tagname,value_double,year,mounth,day,row_number()over(distribute by point_time,tagname,value_double,year,mounth,day sort by point_time) from scada_history.t3 where year="2014" and mounth="05" and day="14" limit 5;
--------------------------------------------------------------------------------------------
Total MapReduce CPU Time Spent: 6 seconds 640 msec
OK
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 00:29:06 63.529999 2014 05 14 1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 01:29:25 63.529999 2014 05 14 1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 02:29:45 63.529999 2014 05 14 1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 03:29:54 63.529999 2014 05 14 1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 04:30:13 63.529999 2014 05 14 1
Time taken: 37.32 seconds, Fetched: 5 row(s)
— 测试1(覆盖指定分区)
— 建表
create table t4(
id int,
name string
)
partitioned by (year string,month string,day string);
— 插入测试数据
insert into table t4 partition(year,month,day) values (1,'zs','2021','05','10'),(1,'zs','2021','05','10'),(1,'zs','2021','05','10'),(1,'zs','2021','05','09'),(1,'zs','2021','05','09'),(1,'zs','2021','05','09'),(1,'zs','2021','06','09'),(1,'zs','2021','06','09'),(1,'zs','2021','06','09'),(2,'ls','2021','05','10'),(2,'ls','2021','05','10'),(2,'ls','2021','05','10'),(2,'ls','2021','05','09'),(2,'ls','2021','05','09'),(2,'ls','2021','05','09'),(2,'ls','2021','06','09'),(2,'ls','2021','06','09'),(2,'ls','2021','06','09');
— 指定分区为5月的分区进行去重
insert overwrite table scada_history.t4 partition(year,month,day) select t.id,t.name,t.year,t.month,t.day from (select id,name,year,month,day,row_number()over(distribute by id,name,year,month,day sort by id) as rn from scada_history.t4) as t where t.rn=1 and t.year="2021" and t.month="05" and t.day="09";
— day为09的分区被覆盖,其余分区未执行复写操作
— 指定分区为月
insert overwrite table scada_history.t4 partition(year,month,day) select t.id,t.name,t.year,t.month,t.day from (select id,name,year,month,day,row_number()over(distribute by id,name,year,month,day sort by id) as rn from scada_history.t4) as t where t.rn=1 and t.year="2021" and t.month="05";
— 分区为05月的被复写去重,06分区未被去重
— 测试2(覆盖指定分区)
— 建表
create table t5(
id int,
name string
)
partitioned by (times string);
— 插入数据
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table scada_history.t5 partition(times) values (1,'zs','2021-05-01'),(1,'zs','2021-05-01'),(1,'zs','2021-05-01'),(1,'zs','2021-05-02'),(1,'zs','2021-05-02'),(1,'zs','2021-05-02'),(2,'ls','2021-05-02'),(2,'ls','2021-05-02'),(2,'ls','2021-05-02'),(2,'ls','2021-05-03'),(2,'ls','2021-05-03'),(2,'ls','2021-05-03'),(2,'ls','2021-05-04'),(2,'ls','2021-05-04'),(2,'ls','2021-05-04'),(2,'ls','2021-05-05'),(2,'ls','2021-05-05'),(2,'ls','2021-05-05'),(2,'ls','2021-05-06'),(2,'ls','2021-05-06'),(2,'ls','2021-05-06'),(2,'ls','2021-05-07'),(2,'ls','2021-05-07'),(2,'ls','2021-05-07'),(2,'ls','2021-05-08'),(2,'ls','2021-05-08'),(2,'ls','2021-05-08'),(2,'ls','2021-05-09'),(2,'ls','2021-05-09'),(2,'ls','2021-05-09'),(2,'ls','2021-05-10'),(2,'ls','2021-05-10'),(2,'ls','2021-05-10');
— 指定分区为 ‘2021-05-01’ 到 ‘2021-05-03’ 的分区进行去重
insert overwrite table scada_history.t5 partition(times) select t.id,t.name,t.times from (select id,name,times,row_number()over(distribute by id,name,times sort by id) as rn from scada_history.t5) as t where t.rn=1 and t.times in ('2021-05-01','2021-05-02','2021-05-03');
— 查询 分区为 ‘2021-05-01’ 到 ‘2021-05-03’ 的结果无重复数据
— 区间进行where条件判断
insert overwrite table scada_history.t5 partition(times) select t.id,t.name,t.times from (select id,name,times,row_number()over(distribute by id,name,times sort by id) as rn from scada_history.t5) as t where t.rn=1 and t.times >='2021-05-04' and t.times <= '2021-05-06';
— 查询 ‘2021-05-04’ 到 ‘2021-05-06’的分区成功去重
结论:
06以后的分区有重复数据,不同分区内数据不会被覆盖