-- Data skew: handle skewed join keys at runtime (skewed keys go to a follow-up map join)
set hive.optimize.skewjoin=true;

-- GROUP BY optimization
set hive.map.aggr=true;                          -- map-side partial aggregation (load balancing before the shuffle)
set hive.groupby.mapaggr.checkinterval=1000000;  -- FIX: was "hive.grouby...", a typo Hive silently ignores; rows sampled before deciding if map-side aggregation pays off
set hive.groupby.skewindata=true;                -- two-stage aggregation to spread skewed GROUP BY keys across reducers
-- Vectorized query execution (processes row batches instead of single rows)
set hive.vectorized.execution.enabled=true;         -- off by default; speeds up scans, aggregates, filters and joins — original note warns it can occasionally error
set hive.vectorized.execution.reduce.enabled=true;  -- also vectorize the reduce side
-- Reducer tuning
set hive.exec.reducers.max=3009;  -- raise the cap (default 1009) when jobs run long due to too few reducers
-- Map join (broadcast the small table to avoid a shuffle join)
set hive.auto.convert.join=true;                             -- let the optimizer convert eligible joins to map joins
set hive.mapjoin.smalltable.filesize=500000000;              -- small-table threshold: 500 MB
set hive.auto.convert.join.noconditionaltask=true;           -- convert without a conditional task when sizes are known
set hive.auto.convert.join.noconditionaltask.size=50000000;  -- combined small-table size limit: 50 MB
set hive.auto.convert.join.use.nonstaged=true;               -- stream the small table directly instead of staging it first
-- Small-file merging (the next two settings are alternatives — pick one per job)
-- Temporary tables: skip the merge pass
set hive.merge.sparkfiles=false;
-- Production tables: merge small output files
set hive.merge.sparkfiles=true;
set hive.merge.size.per.task=1024000000;       -- target size of merged files (~1 GB)
set hive.merge.smallfiles.avgsize=1024000000;  -- trigger a merge when average output file size is below ~1 GB
-- Spark dynamic executor allocation (scale executors up/down with the task backlog)
set spark.executor.instances=1;                            -- baseline executor count before dynamic allocation kicks in
set spark.dynamicAllocation.enabled=true;
set spark.dynamicAllocation.initialExecutors=1;
set spark.dynamicAllocation.minExecutors=0;                -- allow scaling all the way down when idle
set spark.dynamicAllocation.maxExecutors=300;
set spark.dynamicAllocation.schedulerBacklogTimeout=1s;    -- request more executors after tasks queue for 1s
set spark.dynamicAllocation.executorIdleTimeout=60s;       -- release executors idle for 60s
-- Interactive (front-end) queries
set hive.fetch.task.conversion=more;  -- convert simple SELECTs to a single FETCH task instead of launching a job

-- Misc
set hive.mapjoin.optimized.hashtable=false;  -- NOTE(review): presumably disabled to avoid serialization issues with the optimized map-join hashtable — confirm for your engine version
