— 查询测试表t1（无重复数据）

select * from (select point_time,tagname,value_double,year,mounth,day,row_number()over(distribute by point_time,tagname,value_double,year,mounth,day sort by point_time) as rn from scada_history.t1) as t where t.rn>1;

insert overwrite table scada_history.t4 select t.id,t.name,t.age from (select *,row_number()over(distribute by id,name,age sort by id) as rn from scada_history.t4) as t where t.rn=1 and id='1';

— t1表有958026条不重复数据数据

select count(1) from t1;

— 多次（3次）导入t1表数据到t3表（表结构相同）

insert into table t3 partition(year,mounth,day) select * from t1;
insert into table t3 partition(year,mounth,day) select * from t1;
insert into table t3 partition(year,mounth,day) select * from t1;

— t1、t3建表语句

CREATE TABLE `scada_history.t1`( --CREATE TABLE `scada_history.t3`(
  `point_time` string,
  `tagname` string,
  `value_double` double)
PARTITIONED BY (
  `year` string,
  `mounth` string,
  `day` string)
ROW FORMAT SERDE

— 测试

— 查询t3表有重复数据

select point_time,tagname,value_double,year,mounth,day,row_number()over(distribute by point_time,tagname,value_double,year,mounth,day sort by point_time) from scada_history.t3 where year="2014" and mounth="05" and day="14" limit 12;
--------------------------------------------------------------------------------------------
OK
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 00:29:06     63.529999       2014    05      14      1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 00:29:06     63.529999       2014    05      14      2
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 00:29:06     63.529999       2014    05      14      3
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 01:29:25     63.529999       2014    05      14      1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 01:29:25     63.529999       2014    05      14      2
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 01:29:25     63.529999       2014    05      14      3
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 02:29:45     63.529999       2014    05      14      1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 02:29:45     63.529999       2014    05      14      2
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 02:29:45     63.529999       2014    05      14      3
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 03:29:54     63.529999       2014    05      14      1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 03:29:54     63.529999       2014    05      14      2
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 03:29:54     63.529999       2014    05      14      3
Time taken: 26.894 seconds, Fetched: 12 row(s)

— count t3表

select count(1) from t3;
--------------------------------------------------------------------------------------------
Total MapReduce CPU Time Spent: 8 seconds 320 msec
OK
2874078  -- 2874078/3=958026
Time taken: 38.787 seconds, Fetched: 1 row(s)

— 执行表内去重

insert overwrite table scada_history.t3 partition(year,mounth,day) select t.point_time,t.tagname,t.value_double,t.year,t.mounth,t.day from (select point_time,tagname,value_double,year,mounth,day,row_number()over(distribute by point_time,tagname,value_double,year,mounth,day sort by point_time) as rn from scada_history.t3) as t where t.rn=1;

— 执行计数

select count(1) from t3;
--------------------------------------------------------------------------------------------
Total MapReduce CPU Time Spent: 6 seconds 740 msec
OK
958026
Time taken: 35.925 seconds, Fetched: 1 row(s)

— 执行查询t3表没有重复数据

select point_time,tagname,value_double,year,mounth,day,row_number()over(distribute by point_time,tagname,value_double,year,mounth,day sort by point_time) from scada_history.t3 where year="2014" and mounth="05" and day="14" limit 5;
--------------------------------------------------------------------------------------------
Total MapReduce CPU Time Spent: 6 seconds 640 msec
OK
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 00:29:06     63.529999       2014    05      14      1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 01:29:25     63.529999       2014    05      14      1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 02:29:45     63.529999       2014    05      14      1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 03:29:54     63.529999       2014    05      14      1
NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004  2014-05-14 04:30:13     63.529999       2014    05      14      1
Time taken: 37.32 seconds, Fetched: 5 row(s)

— 测试1（覆盖指定分区）

— 建表

create table t4(
id int,
name string
)
partitioned by (year string,month string,day string);

— 插入测试数据

insert into table t4 partition(year,month,day) values (1,'zs','2021','05','10'),(1,'zs','2021','05','10'),(1,'zs','2021','05','10'),(1,'zs','2021','05','09'),(1,'zs','2021','05','09'),(1,'zs','2021','05','09'),(1,'zs','2021','06','09'),(1,'zs','2021','06','09'),(1,'zs','2021','06','09'),(2,'ls','2021','05','10'),(2,'ls','2021','05','10'),(2,'ls','2021','05','10'),(2,'ls','2021','05','09'),(2,'ls','2021','05','09'),(2,'ls','2021','05','09'),(2,'ls','2021','06','09'),(2,'ls','2021','06','09'),(2,'ls','2021','06','09');

— 指定分区为5月的分区进行去重

insert overwrite table scada_history.t4 partition(year,month,day) select t.id,t.name,t.year,t.month,t.day from (select id,name,year,month,day,row_number()over(distribute by id,name,year,month,day sort by id) as rn from scada_history.t4) as t where t.rn=1 and t.year="2021" and t.month="05" and t.day="09";

— day为09的分区被覆盖，其余分区未执行复写操作

— 指定分区为月

insert overwrite table scada_history.t4 partition(year,month,day) select t.id,t.name,t.year,t.month,t.day from (select id,name,year,month,day,row_number()over(distribute by id,name,year,month,day sort by id) as rn from scada_history.t4) as t where t.rn=1 and t.year="2021" and t.month="05";

— 分区为05月的被复写去重，06分区未被去重

— 测试2（覆盖指定分区）

— 建表

create table t5(
id int,
name string
)
partitioned by (times string);

— 插入数据

set hive.exec.dynamic.partition.mode=nonstrict;
insert into table scada_history.t5 partition(times) values (1,'zs','2021-05-01'),(1,'zs','2021-05-01'),(1,'zs','2021-05-01'),(1,'zs','2021-05-02'),(1,'zs','2021-05-02'),(1,'zs','2021-05-02'),(2,'ls','2021-05-02'),(2,'ls','2021-05-02'),(2,'ls','2021-05-02'),(2,'ls','2021-05-03'),(2,'ls','2021-05-03'),(2,'ls','2021-05-03'),(2,'ls','2021-05-04'),(2,'ls','2021-05-04'),(2,'ls','2021-05-04'),(2,'ls','2021-05-05'),(2,'ls','2021-05-05'),(2,'ls','2021-05-05'),(2,'ls','2021-05-06'),(2,'ls','2021-05-06'),(2,'ls','2021-05-06'),(2,'ls','2021-05-07'),(2,'ls','2021-05-07'),(2,'ls','2021-05-07'),(2,'ls','2021-05-08'),(2,'ls','2021-05-08'),(2,'ls','2021-05-08'),(2,'ls','2021-05-09'),(2,'ls','2021-05-09'),(2,'ls','2021-05-09'),(2,'ls','2021-05-10'),(2,'ls','2021-05-10'),(2,'ls','2021-05-10');

— 指定分区为 ‘2021-05-01’ 到 ‘2021-05-03’ 的分区进行去重

insert overwrite table scada_history.t5 partition(times) select t.id,t.name,t.times from (select id,name,times,row_number()over(distribute by id,name,times sort by id) as rn from scada_history.t5) as t where t.rn=1 and t.times in ('2021-05-01','2021-05-02','2021-05-03');

— 查询分区为 ‘2021-05-01’ 到 ‘2021-05-03’ 的结果无重复数据

— 区间进行where条件判断

insert overwrite table scada_history.t5 partition(times) select t.id,t.name,t.times from (select id,name,times,row_number()over(distribute by id,name,times sort by id) as rn from scada_history.t5) as t where t.rn=1 and t.times >='2021-05-04' and t.times <= '2021-05-06';

— 查询 ‘2021-05-04’ 到 ‘2021-05-06’的分区成功去重

结论：

06以后的分区有重复数据，不同分区内数据不会被覆盖

大数据

hive表内去重

— 查询测试表t1（无重复数据）

— t1表有958026条不重复数据数据

— 查询t3表有重复数据

— 测试1（覆盖指定分区）

— 测试2（覆盖指定分区）

结论：