Hive - Hive常用命令 - 《大数据》

设置jvm重用
输入合并
设置内存
设置磁盘
设置并行度
设置任务数
设置资源数
设置动态分区
本地化

设置jvm重用

-- Let one task JVM be reused for up to 10 tasks instead of starting a new JVM per task.
set mapred.job.reuse.jvm.num.tasks=10;

输入合并

-- Use the combining input format so that small input files are merged into larger splits.
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
-- Maximum amount of data handled by one map, in bytes. This also determines the
-- number of map tasks (each task effectively corresponds to one container).
set mapred.max.split.size=256000000;
-- Split size on a single node, in bytes. This is the threshold that controls
-- when small files are merged.
set mapred.min.split.size.per.node=100000000;

设置内存

-- Memory for each map process in MB; covers both heap and non-heap memory.
set mapreduce.map.memory.mb=2048;
-- Map JVM heap size; usually about 0.8 x mapreduce.map.memory.mb.
set mapreduce.map.java.opts=-Xmx1638M;
-- Memory for each reduce process in MB; covers both heap and non-heap memory.
set mapreduce.reduce.memory.mb=2048;
-- Reduce JVM heap size; usually about 0.8 x mapreduce.reduce.memory.mb.
set mapreduce.reduce.java.opts=-Xmx1638M;
-- Size of the map-side circular sort buffer in MB (default 100).
set mapreduce.task.io.sort.mb=200;
-- Fill ratio of the map-side sort buffer at which a spill starts (default 0.8).
set mapreduce.map.sort.spill.percent=0.9;
-- Fraction of memory used as the reduce-side shuffle read buffer
-- (default 0.7 of the memory available to the reducer).
set mapreduce.reduce.shuffle.input.buffer.percent=0.7;
-- Fill ratio of that buffer at which data starts being written to disk (default 0.66).
set mapreduce.reduce.shuffle.merge.percent=0.66;

mapreduce.map.memory.mb
官网解释：
The amount of memory to request from the scheduler for each map task. If this is not specified or is non-positive, it is inferred from mapreduce.map.java.opts and mapreduce.job.heap.memory-mb.ratio. If java-opts are also not specified, we set it to 1024.（如果未指定或为非正数，则根据mapreduce.map.java.opts和mapreduce.job.heap.memory-mb.ratio两个参数推算得出。如果java-opts也未指定，则默认为1024MB。）

mapreduce.reduce.memory.mb
The amount of memory to request from the scheduler for each reduce task. If this is not specified or is non-positive, it is inferred from mapreduce.reduce.java.opts and mapreduce.job.heap.memory-mb.ratio. If java-opts are also not specified, we set it to 1024.

设置磁盘

-- Number of spill files merged at once in a single merge pass (default 10).
-- Raising it reduces the number of merge passes.
set mapreduce.task.io.sort.factor=20;
-- When a Combiner is defined it merges map output inside the map JVM.
-- It is applied during the merge once the number of spill files reaches this
-- threshold (default 3), which reduces the data finally written to disk.
-- The key is mapreduce.map.combine.minspills (deprecated alias:
-- min.num.spills.for.combine); the previously used "min.num.spill.for.combine"
-- matches neither name and therefore had no effect.
set mapreduce.map.combine.minspills=3;
-- Disk and network I/O can also be reduced by compressing spill and merge files.
-- This helps most when intermediate results are large and I/O is the bottleneck:
-- set mapreduce.map.output.compress (default false) to true. Data is then written
-- compressed and must be decompressed when read. Per the original note, Hive jobs on
-- Hadoop are usually I/O-bound rather than CPU-bound and compression can cut I/O by
-- roughly 10x. Codecs include Gzip, Lzo, BZip2 and Lzma (Lzo being a balanced choice);
-- the codec is chosen with mapreduce.map.output.compress.codec
-- (default org.apache.hadoop.io.compress.DefaultCodec).
-- Compression costs CPU, so it fits workloads whose bottleneck is I/O.
set mapreduce.map.output.compress=true;
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;

设置并行度

-- Run independent stages in parallel; for SQL that can be parallelised (e.g. union)
-- this can improve execution time considerably. Example:
--     select count(1) from a group by id
--     union all
--     select count(1) from b group by id
-- The statement above contains two MapReduce jobs. On YARN each job is one
-- application with its own application id, and each application consists of an
-- ApplicationMaster container, mapper containers and (if there is a reduce phase)
-- reducer containers. With hive.exec.parallel=true the two applications run concurrently.
set hive.exec.parallel=true;
-- Maximum number of threads for parallel stages, i.e. the cap on concurrently running applications.
set hive.exec.parallel.thread.number=16;
-- The ApplicationMaster starts scheduling reduce tasks once the fraction of completed
-- map tasks exceeds mapreduce.job.reduce.slowstart.completedmaps (default 0.05).
set mapreduce.job.reduce.slowstart.completedmaps=0.05;
-- Number of parallel copies each reducer uses to fetch map output (default 5).
set mapreduce.reduce.shuffle.parallelcopies=5;
-- Maximum number of CPU cores each map task may use (default 1).
set mapreduce.map.cpu.vcores=1;
-- Maximum number of CPU cores each reduce task may use (default 1).
set mapreduce.reduce.cpu.vcores=1;

设置任务数

-- Fix the number of reduce tasks.
set mapred.reduce.tasks=20;
-- Amount of data handled by each reducer, in bytes; this also drives the reducer count.
-- NOTE(review): with mapred.reduce.tasks fixed above, Hive does not need to estimate the
-- reducer count from this value; use mapred.reduce.tasks=-1 if estimation is intended — confirm.
set hive.exec.reducers.bytes.per.reducer=100000000;
-- Maximum amount of data handled by one map, in bytes. This also determines the
-- number of map tasks (each task effectively corresponds to one container).
set mapred.max.split.size=256000000;
-- Split size on a single node, in bytes. This is the threshold that controls
-- when small files are merged.
set mapred.min.split.size.per.node=100000000;
-- Merge small output files at the end of a map-only job.
set hive.merge.mapfiles=true;
-- Merge small output files at the end of a map-reduce job.
set hive.merge.mapredfiles=true;
-- Target size of the merged files, in bytes (256 * 1000 * 1000).
-- Written as a literal because `set` does not evaluate arithmetic expressions.
set hive.merge.size.per.task=256000000;
-- When the average output file size is below this value (16 * 1000 * 1000 bytes),
-- a separate MapReduce job is launched to merge the small files.
set hive.merge.smallfiles.avgsize=16000000;

设置资源数

-- Resource queue to submit to (for example root.urgent).
set mapred.job.queue.name=root.default;
-- NOTE(review): the yarn.scheduler.* and yarn.nodemanager.* keys below are normally read
-- by the ResourceManager / NodeManager from yarn-site.xml; setting them in a Hive session
-- presumably does not change the cluster's behaviour — confirm before relying on them.
-- Minimum memory a container may request, in MB.
set yarn.scheduler.minimum-allocation-mb=1024;
-- Maximum memory a container may request, in MB.
set yarn.scheduler.maximum-allocation-mb=32768;
-- Minimum number of vcores a container may request.
set yarn.scheduler.minimum-allocation-vcores=1;
-- Maximum number of vcores a container may request.
set yarn.scheduler.maximum-allocation-vcores=16;
-- Heap size of the ApplicationMaster container.
set yarn.app.mapreduce.am.command-opts=-Xmx2048M;
-- Memory of the ApplicationMaster container, in MB.
set yarn.app.mapreduce.am.resource.mb=4096;
-- Memory available to the NodeManager, in MB (default 8192 when auto-detection is off).
set yarn.nodemanager.resource.memory-mb=57344;
-- Number of vcores available to the NodeManager (default 8 when auto-detection is off).
set yarn.nodemanager.resource.cpu-vcores=16;

设置动态分区

-- Enable dynamic partitioning (described in the original note as off by default).
set hive.exec.dynamic.partition=true;
-- nonstrict allows every partition column to be dynamic; under strict (the default)
-- at least one static partition column is required.
set hive.exec.dynamic.partition.mode=nonstrict;
动态分区相关的调优参数：
-- Maximum number of dynamic partitions each mapper or reducer may create; exceeding
-- it raises an error. Default 100; it is commonly raised, e.g. to 1000.
set hive.exec.max.dynamic.partitions.pernode=100;
-- Maximum number of dynamic partitions a single statement may create; exceeding it
-- raises an error. 1000 is the default.
set hive.exec.max.dynamic.partitions=1000;
-- Maximum number of files that may be created overall; exceeding it raises an error.
-- NOTE(review): the original note labelled 10000 as the default, but Hive documents
-- the default as 100000 — confirm which value is intended.
set hive.exec.max.created.files=10000;

本地化

-- Select the local MapReduce framework for this session.
set mapreduce.framework.name=local;