— 查询测试表t1(无重复数据)

  1. select * from (select point_time,tagname,value_double,year,mounth,day,row_number()over(distribute by point_time,tagname,value_double,year,mounth,day sort by point_time) as rn from scada_history.t1) as t where t.rn>1;

  1. insert overwrite table scada_history.t4 select t.id,t.name,t.age from (select *,row_number()over(distribute by id,name,age sort by id) as rn from scada_history.t4) as t where t.rn=1 and id='1';

— t1表有958026条不重复数据数据

  1. select count(1) from t1;

— 多次(3次)导入t1表数据到t3表(表结构相同)

  1. insert into table t3 partition(year,mounth,day) select * from t1;
  2. insert into table t3 partition(year,mounth,day) select * from t1;
  3. insert into table t3 partition(year,mounth,day) select * from t1;

— t1、t3建表语句

  1. CREATE TABLE `scada_history.t1`( --CREATE TABLE `scada_history.t3`(
  2. `point_time` string,
  3. `tagname` string,
  4. `value_double` double)
  5. PARTITIONED BY (
  6. `year` string,
  7. `mounth` string,
  8. `day` string)
  9. ROW FORMAT SERDE

— 测试

— 查询t3表有重复数据

  1. select point_time,tagname,value_double,year,mounth,day,row_number()over(distribute by point_time,tagname,value_double,year,mounth,day sort by point_time) from scada_history.t3 where year="2014" and mounth="05" and day="14" limit 12;
  2. --------------------------------------------------------------------------------------------
  3. OK
  4. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 00:29:06 63.529999 2014 05 14 1
  5. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 00:29:06 63.529999 2014 05 14 2
  6. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 00:29:06 63.529999 2014 05 14 3
  7. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 01:29:25 63.529999 2014 05 14 1
  8. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 01:29:25 63.529999 2014 05 14 2
  9. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 01:29:25 63.529999 2014 05 14 3
  10. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 02:29:45 63.529999 2014 05 14 1
  11. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 02:29:45 63.529999 2014 05 14 2
  12. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 02:29:45 63.529999 2014 05 14 3
  13. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 03:29:54 63.529999 2014 05 14 1
  14. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 03:29:54 63.529999 2014 05 14 2
  15. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 03:29:54 63.529999 2014 05 14 3
  16. Time taken: 26.894 seconds, Fetched: 12 row(s)

— count t3表

  1. select count(1) from t3;
  2. --------------------------------------------------------------------------------------------
  3. Total MapReduce CPU Time Spent: 8 seconds 320 msec
  4. OK
  5. 2874078 -- 2874078/3=958026
  6. Time taken: 38.787 seconds, Fetched: 1 row(s)

— 执行表内去重

  1. insert overwrite table scada_history.t3 partition(year,mounth,day) select t.point_time,t.tagname,t.value_double,t.year,t.mounth,t.day from (select point_time,tagname,value_double,year,mounth,day,row_number()over(distribute by point_time,tagname,value_double,year,mounth,day sort by point_time) as rn from scada_history.t3) as t where t.rn=1;

— 执行计数

  1. select count(1) from t3;
  2. --------------------------------------------------------------------------------------------
  3. Total MapReduce CPU Time Spent: 6 seconds 740 msec
  4. OK
  5. 958026
  6. Time taken: 35.925 seconds, Fetched: 1 row(s)

— 执行查询t3表没有重复数据

  1. select point_time,tagname,value_double,year,mounth,day,row_number()over(distribute by point_time,tagname,value_double,year,mounth,day sort by point_time) from scada_history.t3 where year="2014" and mounth="05" and day="14" limit 5;
  2. --------------------------------------------------------------------------------------------
  3. Total MapReduce CPU Time Spent: 6 seconds 640 msec
  4. OK
  5. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 00:29:06 63.529999 2014 05 14 1
  6. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 01:29:25 63.529999 2014 05 14 1
  7. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 02:29:45 63.529999 2014 05 14 1
  8. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 03:29:54 63.529999 2014 05 14 1
  9. NSSDJL.NX_GD_NSSF_DD_P1_L1_001_FXWG004 2014-05-14 04:30:13 63.529999 2014 05 14 1
  10. Time taken: 37.32 seconds, Fetched: 5 row(s)


— 测试1(覆盖指定分区)

— 建表

  1. create table t4(
  2. id int,
  3. name string
  4. )
  5. partitioned by (year string,month string,day string);

— 插入测试数据

  1. insert into table t4 partition(year,month,day) values (1,'zs','2021','05','10'),(1,'zs','2021','05','10'),(1,'zs','2021','05','10'),(1,'zs','2021','05','09'),(1,'zs','2021','05','09'),(1,'zs','2021','05','09'),(1,'zs','2021','06','09'),(1,'zs','2021','06','09'),(1,'zs','2021','06','09'),(2,'ls','2021','05','10'),(2,'ls','2021','05','10'),(2,'ls','2021','05','10'),(2,'ls','2021','05','09'),(2,'ls','2021','05','09'),(2,'ls','2021','05','09'),(2,'ls','2021','06','09'),(2,'ls','2021','06','09'),(2,'ls','2021','06','09');

image.png
— 指定分区为5月的分区进行去重

  1. insert overwrite table scada_history.t4 partition(year,month,day) select t.id,t.name,t.year,t.month,t.day from (select id,name,year,month,day,row_number()over(distribute by id,name,year,month,day sort by id) as rn from scada_history.t4) as t where t.rn=1 and t.year="2021" and t.month="05" and t.day="09";

— day为09的分区被覆盖,其余分区未执行复写操作
image.png

— 指定分区为月

  1. insert overwrite table scada_history.t4 partition(year,month,day) select t.id,t.name,t.year,t.month,t.day from (select id,name,year,month,day,row_number()over(distribute by id,name,year,month,day sort by id) as rn from scada_history.t4) as t where t.rn=1 and t.year="2021" and t.month="05";

— 分区为05月的被复写去重,06分区未被去重
image.png

— 测试2(覆盖指定分区)

— 建表

  1. create table t5(
  2. id int,
  3. name string
  4. )
  5. partitioned by (times string);

— 插入数据

  1. set hive.exec.dynamic.partition.mode=nonstrict;
  2. insert into table scada_history.t5 partition(times) values (1,'zs','2021-05-01'),(1,'zs','2021-05-01'),(1,'zs','2021-05-01'),(1,'zs','2021-05-02'),(1,'zs','2021-05-02'),(1,'zs','2021-05-02'),(2,'ls','2021-05-02'),(2,'ls','2021-05-02'),(2,'ls','2021-05-02'),(2,'ls','2021-05-03'),(2,'ls','2021-05-03'),(2,'ls','2021-05-03'),(2,'ls','2021-05-04'),(2,'ls','2021-05-04'),(2,'ls','2021-05-04'),(2,'ls','2021-05-05'),(2,'ls','2021-05-05'),(2,'ls','2021-05-05'),(2,'ls','2021-05-06'),(2,'ls','2021-05-06'),(2,'ls','2021-05-06'),(2,'ls','2021-05-07'),(2,'ls','2021-05-07'),(2,'ls','2021-05-07'),(2,'ls','2021-05-08'),(2,'ls','2021-05-08'),(2,'ls','2021-05-08'),(2,'ls','2021-05-09'),(2,'ls','2021-05-09'),(2,'ls','2021-05-09'),(2,'ls','2021-05-10'),(2,'ls','2021-05-10'),(2,'ls','2021-05-10');

— 指定分区为 ‘2021-05-01’ 到 ‘2021-05-03’ 的分区进行去重

  1. insert overwrite table scada_history.t5 partition(times) select t.id,t.name,t.times from (select id,name,times,row_number()over(distribute by id,name,times sort by id) as rn from scada_history.t5) as t where t.rn=1 and t.times in ('2021-05-01','2021-05-02','2021-05-03');

— 查询 分区为 ‘2021-05-01’ 到 ‘2021-05-03’ 的结果无重复数据
image.png
— 区间进行where条件判断

  1. insert overwrite table scada_history.t5 partition(times) select t.id,t.name,t.times from (select id,name,times,row_number()over(distribute by id,name,times sort by id) as rn from scada_history.t5) as t where t.rn=1 and t.times >='2021-05-04' and t.times <= '2021-05-06';

— 查询 ‘2021-05-04’ 到 ‘2021-05-06’的分区成功去重
image.png

结论:

06以后的分区有重复数据,不同分区内数据不会被覆盖