1、将hive数据导出到csv:

    # Three ways to export Hive data:
    # (1) a Julia script that converts a TSV dump to .xlsx given column names and types;
    # (2) hive -e piped through sed, collapsing whitespace runs into commas;
    # (3) beeline with --outputformat=tsv2, redirecting stdout to a file.
    1. julia /dwd/bak/yuanhu/julia/tsv_2_excel.jl "订单号,下单日期,业务类型,店铺名称,店铺ID,评价日期,评分,商品编码,SKUID,品牌ID,品牌,商品名称" "string,string,string,string,string,string,string,string,string,string,string,string" "/dwd/bak/张楠池/data/分析POP订单商品评价分析_2021-11-29.tsv" "/dwd/bak/张楠池/data/分析POP订单商品评价分析_2021-11-29.xlsx"
    # NOTE(review): the sed approach corrupts any field whose value itself contains whitespace.
    2. hive -e 'select books from table' | sed 's/[[:space:]]\+/,/g' > /home/lvermeer/temp.csv
    3. beeline -u jdbc:hive2://172.16.16.7:10000 -n dwetl -p ddky@admin --verbose=true --outputformat=tsv2 -e "drop table temp.tmp_aa; create table temp.tmp_aa as select order_id, order_amt from dw.s01_or_order where ds = '2021-11-04' limit 30; select * from temp.tmp_aa;" > /dwd/bak/yuanhu/data/data.csv

    2、yarn 上查看日志

    # Dump the aggregated container logs of a YARN application to a local file.
    1. yarn logs -applicationId application_1517538889175_2550 > logs.txt
    2. ## kill a running YARN application
    3. yarn application --kill application_1605802776975_1523527

    3、hive显示列名,表头

    -- Print column headers in Hive CLI query output.
    1. set hive.cli.print.header=true;

    4、hive 常用优化设置

    1. --join-related optimizations
    2. set hive.exec.mode.local.auto=true;
    3. SET hive.auto.convert.join = TRUE;
    4. SET hive.optimize.correlation = TRUE;
    5. SET hive.auto.convert.join.noconditionaltask = TRUE;
    6. SET hive.exec.parallel = TRUE;
    7. --skew-join optimization
    8. set hive.optimize.skewjoin=true;
    9. set hive.skewjoin.key=100000;
    10. --memory tuning
    -- NOTE(review): mapred.reduce.tasks is set again to 150 further down; the last setting wins.
    11. set mapred.reduce.tasks = 20;
    12. set mapreduce.map.memory.mb = 4096;
    13. set mapreduce.reduce.memory.mb = 4096;
    14. set mapreduce.map.java.opts=-Xmx3278m;
    15. set mapreduce.reduce.java.opts=-Xmx3278m;
    16. --control the number of mappers (split sizes in bytes)
    17. set mapred.max.split.size=100000000;
    18. set mapred.min.split.size.per.node=100000000;
    19. set mapred.min.split.size.per.rack=100000000;
    20. set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; --merge small files before the map phase
    21. --control the number of reducers
    22. set hive.exec.reducers.bytes.per.reducer=100000000;
    23. set mapred.reduce.tasks = 150;
    24. --predicate pushdown: filter base data as early as possible to shrink the data volume processed downstream
    25. set hive.optimize.ppd=true;
    26. --map-side partial aggregation greatly reduces the data shipped from map to reduce
    27. set hive.map.aggr=true;
    28. set hive.groupby.skewindata=true;
    29. --optimizing hive union all
    30. Hive countdistinct,group by会导致union all 同表关联扫描多次表,multi insert 不会,可以通过调整count,distinct,group by 位置,先执行union all后进行countdistinct,group by来避免这种情况发生。
    31. --optimizing joins between two large bucketed/sorted tables (sort-merge bucket join)
    32. set hive.auto.convert.sortmerge.join=true;
    33. set hive.optimize.bucketmapjoin = true;
    34. set hive.optimize.bucketmapjoin.sortedmerge = true;
    35. set hive.auto.convert.sortmerge.join.noconditionaltask=true;
    36. --hive on spark: merge small output files
    37. set hive.merge.sparkfiles=true;
    -- NOTE(review): Hive `set` does not evaluate arithmetic — "256*1000*1000" is stored as a
    -- literal string; use 256000000 instead. TODO confirm against the target Hive version.
    38. set hive.merge.size.per.task = 256*1000*1000;

    5、hive建表语句

    -- Create a single-column, comma-delimited text table and load a local file into it
    -- (OVERWRITE replaces any existing data in the table).
    1. create table temp.tmp_phone_dr (phone string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE;
    2. load data local inpath '/dwd/bak/yuanhu/20210206/phone.txt' overwrite into table temp.tmp_phone_dr;

    6、hive添加字段

    -- Hive: make DROP TABLE also purge the underlying data of an external table.
    1. ALTER TABLE table_nm SET TBLPROPERTIES ('external.table.purge'='true');
    2. --add CASCADE to force-refresh the metadata of all existing partitions
    3. alter table table_name add columns (c_time string comment '当前时间') cascade; -- correct: appended as the last column
    4. alter table table_name change c_time c_time string after address ; -- correct: moves the column to right after `address`
    5. alter table table_name drop if exists partition(par_col=col_name); --drop a partition
    6. Alter table 表名 change column 原字段名称 现字段名称 数据类型
    7. ALTER TABLE table_name SET TBLPROPERTIES('comment' = '表的新注释');
    8. ALTER TABLE test REPLACE COLUMNS(id BIGINT, name STRING);
    9. --mysql
    10. ALTER TABLE device ADD COLUMN `device_type` VARCHAR(32) DEFAULT NULL COMMENT '设备类型';
    11. ALTER TABLE 表名 CHANGE 字段名 新字段名 字段属性
    12. --indexes (MySQL)
    13. alter table table_name add index index_name (column_list) ;
    14. create unique index id_index on s49_app_compete_prod_data_monitor(identifies, stat_dd);
    15. --add a primary key
    16. alter table `s49_app_shop_all_prod` add primary key (`identifies`);
    -- Register Hive UDFs: temporary functions live for the session only;
    -- permanent functions are backed by a jar stored on HDFS.
    1. ADD JAR /home/dwetl/hive_udf-1.0-SNAPSHOT.jar;
    2. CREATE temporary function getMonthDays AS 'com.ddky.bi.GetMonthDay';
    3. hdfs://172.16.16.12:8888/user/
    4. ADD JAR hdfs://bidc/bi/udf_pkg/gettimeendpoit.jar;
    5. CREATE temporary function getTimeEndpoit AS 'com.ddky.bi.GetTimeEndpoit';
    6. --create a permanent function from a jar on HDFS
    7. create function is_blank as 'com.ddky.bi.StringIsBlank' using jar 'hdfs://bidc/bi/udf_pkg/hive_udf_permanent.jar';
    8. create function replace_blank as 'com.ddky.bi.ReplaceBlankUDF' using jar 'hdfs://bidc/bi/udf_pkg/replace_bland_udf.jar';
    9. beeline -u "jdbc:hive2://172.16.16.7:10000" -ndwetl -pddky@admin

    保留小数点后四位 * 100,连接%

    -- Round x/y to 4 decimal places, scale to a percentage, and append '%'.
    1. concat(round(x/y, 4) * 100, '%')
    1. --hive 2.1.1+: hive-site.xml property required to retrieve operation logs over jdbc
    2. <property>
    3. <name>hive.async.log.enabled</name>
    4. <value>false</value>
    5. </property>

    hive 利用反射调取java 代码

    -- Invoke a static Java method from Hive via the built-in reflect() UDF.
    1. select reflect("org.apache.commons.lang.StringUtils", "isNotBlank", "");

    行转列代码:

    -- Row-to-column (pivot) pattern:
    -- CTE `t` packs each user's monthly (order_amt, order_num) pair into a map keyed
    -- by 'YYYY-MM' (str_to_map over collect_list of 'mm:amt_num' strings); the outer
    -- query then unpacks the map into one column per month via split(info[key])[i],
    -- defaulting missing months to 0 with coalesce.
    -- NOTE(review): identifiers starting with a digit (201901_amt, 2018_order_num)
    -- normally require backticks in Hive — confirm this ran as-is on the target version.
    1. with t AS (
    2. SELECT user_id
    3. ,str_to_map(concat_ws(',',collect_list(concat_ws(':',mm,concat(order_amt,'_',order_num))))) info
    4. FROM temp.temp_lzl_2019_order
    5. GROUP BY user_id)
    6. SELECT t1.user_id `用户ID`
    7. ,t1.create_tm `用户注册时间`
    8. ,t1.first_tm `首单时间`
    9. ,t1.is_lte_0 `首单是否小于等于1元`
    10. ,t2.2018_order_num `截止2018年12月订单数`
    11. ,t2.2018_order_amt `截止2018年12月订单金额`
    12. ,t3.201901_num `2019年1月订单数`
    13. ,t3.201901_amt `2019年1月金额`
    14. ,t3.201902_num `2019年2月订单数`
    15. ,t3.201902_amt `2019年2月金额`
    16. ,t3.201903_num `2019年3月订单数`
    17. ,t3.201903_amt `2019年3月金额`
    18. ,t3.201904_num `2019年4月订单数`
    19. ,t3.201904_amt `2019年4月金额`
    20. ,t3.201905_num `2019年5月订单数`
    21. ,t3.201905_amt `2019年5月金额`
    22. ,t3.201906_num `2019年6月订单数`
    23. ,t3.201906_amt `2019年6月金额`
    24. ,t3.201907_num `2019年7月订单数`
    25. ,t3.201907_amt `2019年7月金额`
    26. ,t3.201908_num `2019年8月订单数`
    27. ,t3.201908_amt `2019年8月金额`
    28. ,t3.201909_num `2019年9月订单数`
    29. ,t3.201909_amt `2019年9月金额`
    30. ,t3.201910_num `2019年10月订单数`
    31. ,t3.201910_amt `2019年10月金额`
    32. ,t3.201911_num `2019年11月订单数`
    33. ,t3.201911_amt `2019年11月金额`
    34. ,t3.201912_num `2019年12月订单数`
    35. ,t3.201912_amt `2019年12月金额`
    36. ,t3.202001_num `2020年1月订单数`
    37. ,t3.202001_amt `2020年1月金额`
    38. ,t3.202002_num `2020年2月订单数`
    39. ,t3.202002_amt `2020年2月金额`
    40. ,t3.202003_num `2020年3月订单数`
    41. ,t3.202003_amt `2020年3月金额`
    42. ,t3.202004_num `2020年4月订单数`
    43. ,t3.202004_amt `2020年4月金额`
    44. ,t3.202005_num `2020年5月订单数`
    45. ,t3.202005_amt `2020年5月金额`
    46. ,t3.202006_num `2020年6月订单数`
    47. ,t3.202006_amt `2020年6月金额`
    48. ,t3.202007_num `2020年7月订单数`
    49. ,t3.202007_amt `2020年7月金额`
    50. ,t3.202008_num `2020年8月订单数`
    51. ,t3.202008_amt `2020年8月金额`
    52. ,t3.202009_num `2020年9月订单数`
    53. ,t3.202009_amt `2020年9月金额`
    54. ,t3.202010_num `2020年10月订单数`
    55. ,t3.202010_amt `2020年10月金额`
    56. FROM
    57. (
    58. SELECT user_id
    59. ,create_tm
    60. ,first_tm
    61. ,is_lte_0
    62. FROM temp.temp_lzl_user
    63. ) t1
    64. JOIN
    65. (
    -- Unpack the per-user month map: element [0] is the amount, [1] the order count.
    66. SELECT user_id
    67. ,coalesce(split(info['2019-01'],'_')[0],0) 201901_amt
    68. ,coalesce(split(info['2019-01'],'_')[1],0) 201901_num
    69. ,coalesce(split(info['2019-02'],'_')[0],0) 201902_amt
    70. ,coalesce(split(info['2019-02'],'_')[1],0) 201902_num
    71. ,coalesce(split(info['2019-03'],'_')[0],0) 201903_amt
    72. ,coalesce(split(info['2019-03'],'_')[1],0) 201903_num
    73. ,coalesce(split(info['2019-04'],'_')[0],0) 201904_amt
    74. ,coalesce(split(info['2019-04'],'_')[1],0) 201904_num
    75. ,coalesce(split(info['2019-05'],'_')[0],0) 201905_amt
    76. ,coalesce(split(info['2019-05'],'_')[1],0) 201905_num
    77. ,coalesce(split(info['2019-06'],'_')[0],0) 201906_amt
    78. ,coalesce(split(info['2019-06'],'_')[1],0) 201906_num
    79. ,coalesce(split(info['2019-07'],'_')[0],0) 201907_amt
    80. ,coalesce(split(info['2019-07'],'_')[1],0) 201907_num
    81. ,coalesce(split(info['2019-08'],'_')[0],0) 201908_amt
    82. ,coalesce(split(info['2019-08'],'_')[1],0) 201908_num
    83. ,coalesce(split(info['2019-09'],'_')[0],0) 201909_amt
    84. ,coalesce(split(info['2019-09'],'_')[1],0) 201909_num
    85. ,coalesce(split(info['2019-10'],'_')[0],0) 201910_amt
    86. ,coalesce(split(info['2019-10'],'_')[1],0) 201910_num
    87. ,coalesce(split(info['2019-11'],'_')[0],0) 201911_amt
    88. ,coalesce(split(info['2019-11'],'_')[1],0) 201911_num
    89. ,coalesce(split(info['2019-12'],'_')[0],0) 201912_amt
    90. ,coalesce(split(info['2019-12'],'_')[1],0) 201912_num
    91. ,coalesce(split(info['2020-01'],'_')[0],0) 202001_amt
    92. ,coalesce(split(info['2020-01'],'_')[1],0) 202001_num
    93. ,coalesce(split(info['2020-02'],'_')[0],0) 202002_amt
    94. ,coalesce(split(info['2020-02'],'_')[1],0) 202002_num
    95. ,coalesce(split(info['2020-03'],'_')[0],0) 202003_amt
    96. ,coalesce(split(info['2020-03'],'_')[1],0) 202003_num
    97. ,coalesce(split(info['2020-04'],'_')[0],0) 202004_amt
    98. ,coalesce(split(info['2020-04'],'_')[1],0) 202004_num
    99. ,coalesce(split(info['2020-05'],'_')[0],0) 202005_amt
    100. ,coalesce(split(info['2020-05'],'_')[1],0) 202005_num
    101. ,coalesce(split(info['2020-06'],'_')[0],0) 202006_amt
    102. ,coalesce(split(info['2020-06'],'_')[1],0) 202006_num
    103. ,coalesce(split(info['2020-07'],'_')[0],0) 202007_amt
    104. ,coalesce(split(info['2020-07'],'_')[1],0) 202007_num
    105. ,coalesce(split(info['2020-08'],'_')[0],0) 202008_amt
    106. ,coalesce(split(info['2020-08'],'_')[1],0) 202008_num
    107. ,coalesce(split(info['2020-09'],'_')[0],0) 202009_amt
    108. ,coalesce(split(info['2020-09'],'_')[1],0) 202009_num
    109. ,coalesce(split(info['2020-10'],'_')[0],0) 202010_amt
    110. ,coalesce(split(info['2020-10'],'_')[1],0) 202010_num
    111. FROM t
    112. ) t3
    113. ON t1.user_id = t3.user_id
    -- LEFT JOIN: users with no completed pre-2019 orders keep NULL 2018 totals.
    114. LEFT JOIN
    115. (
    116. SELECT user_id
    117. ,COUNT(o1.order_id) 2018_order_num
    -- amounts are stored in cents; /100 converts to yuan
    118. ,SUM(o1.order_amt)/100 2018_order_amt
    119. FROM dw.s01_or_order o1
    120. JOIN dim.s01_dim_shop s1
    121. ON o1.shop_id = s1.shop_cd AND s1.ds = '2020-10-31' AND s1.city_nm IN ('深圳市', '东莞市')
    122. JOIN dim.s01_dim_order_source s2
    123. ON s2.order_source_cd = o1.order_src_cd AND s2.source_cata_id = '1'
    124. WHERE o1.ds = '2020-10-31'
    125. AND o1.cplt_tm < '2019-01-01'
    126. AND o1.order_status_cd = 9
    127. GROUP BY user_id
    128. ) t2
    129. ON t3.user_id = t2.user_id
    130. ;
    -- Bounding-box shop matching via a synthetic bucketing key:
    -- each shop row is exploded into 10 copies (num_key 1..10); each candidate point
    -- gets a random num_key, so the would-be cartesian join is spread across buckets
    -- by joining on num_key and then filtered by the lat/lng bounding box.
    1. --cartesian product (distributed via num_key buckets)
    2. DROP TABLE temp.temp_shop_lt_result;
    3. CREATE TABLE temp.temp_shop_lt_result AS
    4. with temp_shop_lt_arr AS (
    5. SELECT *
    6. ,split("1,2,3,4,5,6,7,8,9,10",",") AS num_arr
    7. FROM temp.temp_shop_lt)
    8. SELECT shop_id
    9. ,shop_nm
    10. ,shop_lgtud
    11. ,shop_lttud
    12. ,area_lgtud
    13. ,area_lttud
    14. ,minLat
    15. ,minLng
    16. ,maxLat
    17. ,maxLng
    18. ,num_key
    19. FROM temp_shop_lt_arr lateral view explode
    20. (num_arr) tb_view AS num_key;
    21. DROP TABLE temp.temp_s49_shop_key;
    22. CREATE TABLE temp.temp_s49_shop_key AS
    23. SELECT shop_key
    24. ,shop_name
    25. ,longitude
    26. ,latitude
    27. ,source
    -- NOTE(review): ceiling(rand()*9) yields 1..9 (never 10), and can yield 0 in the
    -- rare case rand() returns exactly 0, silently dropping that row from the join;
    -- floor(rand()*10)+1 would cover 1..10 uniformly — confirm intent.
    28. ,ceiling(rand()*9) AS num_key
    29. FROM dw.s49_shop_info;
    30. use DW;
    31. set mapred.output.compress=true;
    32. set hive.exec.compress.output=true;
    33. set mapred.output.compression.codec=com.hadoop.compression.lzo.LzopCodec;
    34. set mapred.job.queue.name=root.mapreduce.phive;
    35. set hive.auto.convert.join=false;
    36. set mapred.reduce.tasks = 10;
    37. set hive.optimize.skewjoin=true;
    38. INSERT OVERWRITE TABLE dim.s49_dim_shop_mapping partition (ds = '2020-12-13')
    39. SELECT t2.source
    40. ,t1.shop_id
    41. ,t1.shop_nm
    42. ,t1.shop_lgtud
    43. ,t1.shop_lttud
    44. ,t2.shop_key
    45. ,t2.shop_name
    46. ,t2.longitude
    47. ,t2.latitude
    48. FROM
    49. (
    50. SELECT shop_id
    51. ,shop_nm
    52. ,shop_lgtud
    53. ,shop_lttud
    54. ,area_lgtud
    55. ,area_lttud
    56. ,maxLng
    57. ,maxLat
    58. ,minLat
    59. ,minLng
    60. ,num_key
    61. FROM temp.temp_shop_lt_result
    62. ) t1
    63. JOIN
    64. (
    65. SELECT shop_key
    66. ,shop_name
    67. ,longitude
    68. ,latitude
    69. ,num_key
    70. ,source
    71. FROM temp.temp_s49_shop_key
    72. ) t2
    73. ON t1.num_key = t2.num_key
    -- keep only points that fall inside the shop's bounding box
    74. WHERE t2.longitude >= t1.minLng
    75. AND t2.longitude <= t1.maxLng
    76. AND t2.latitude >= t1.minLat
    77. AND t2.latitude <= t1.maxLat
    78. ;

    hive Sql中的高级应用:https://blog.csdn.net/qq_41712271/article/details/109224538
    hive中percentile, percentile_approx

    复杂udf函数编写:

    1. package com.ddky.bi;
    2. import cn.hutool.core.util.NumberUtil;
    3. import cn.hutool.core.util.ReUtil;
    4. import cn.hutool.core.util.StrUtil;
    5. import com.ddky.bi.utils.CommonUtil;
    6. import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
    7. import org.apache.hadoop.hive.ql.metadata.HiveException;
    8. import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
    9. import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
    10. import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    11. import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
    12. import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
    13. import org.apache.hadoop.io.FloatWritable;
    14. import org.apache.hadoop.io.IntWritable;
    15. import org.apache.hadoop.io.Text;
    16. import java.util.ArrayList;
    17. import java.util.List;
    18. /**
    19. * Created by yuanhu on 2021/7/12 17:29
    */
    // Hive GenericUDF that matches one of our products against a list of competitor
    // product strings ("####"-delimited fields) and returns the best match as a
    // 6-field struct (shop name/key, product name, price, monthly sales, deliver type).
    // NOTE(review): in evaluate(), the exact-name-match branch returns the raw joined
    // String instead of toResult(firstResult), while initialize() declares a struct
    // return type — this looks inconsistent; confirm against production behavior.
    20. */
    21. public class GGProdMatchUDF extends GenericUDF {
    22. private Object[] result;
    23. private ListObjectInspector listObjectInspector;
    24. private String orgProdNm;
    25. private List paramList;
    26. @Override
    // Validates arguments (exactly 2; second must be a LIST) and declares the
    // 6-field struct that evaluate() is expected to return.
    27. public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
    28. if (objectInspectors.length != 2){
    29. throw new UDFArgumentException("参数缺失,本函数至少俩个参数!");
    30. }
    31. if (!objectInspectors[1].getCategory().equals(ObjectInspector.Category.LIST)){
    32. throw new UDFArgumentException("参数错误,本函数第二个参数必须是List类型!");
    33. }
    34. listObjectInspector = (ListObjectInspector) objectInspectors[1];
    35. List<String> structFieldNames = new ArrayList<>();
    36. List<ObjectInspector> structFieldObjectInspector = new ArrayList<>();
    37. structFieldNames.add("compete_shop_name");
    38. structFieldObjectInspector.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    39. structFieldNames.add("compete_shop_key");
    40. structFieldObjectInspector.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    41. structFieldNames.add("compete_prod_name");
    42. structFieldObjectInspector.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    43. structFieldNames.add("compete_price");
    44. structFieldObjectInspector.add(PrimitiveObjectInspectorFactory.writableFloatObjectInspector);
    45. structFieldNames.add("compete_monthly_sales");
    46. structFieldObjectInspector.add(PrimitiveObjectInspectorFactory.writableIntObjectInspector);
    47. structFieldNames.add("compete_deliver_type");
    48. structFieldObjectInspector.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    49. return ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames, structFieldObjectInspector);
    50. }
    51. @Override
    // Two-pass matching: exact normalized-name match first, then a fallback that
    // compares the product of all numeric spec values in the two names.
    // NOTE(review): mutating the instance fields orgProdNm/paramList inside the
    // stream lambda is fragile; local variables would be safer — confirm no reuse.
    52. public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
    53. //get our (top) product name
    54. orgProdNm = deferredObjects[0].get().toString();
    55. //get the list of competitor products
    56. paramList = listObjectInspector.getList(deferredObjects[1].get());
    57. //normalize the top product name
    58. orgProdNm = prepareProcessProdNm(orgProdNm);
    59. //step 1: if some competitor name matches exactly after normalization, return it immediately
    60. String firstResult = paramList.stream().filter(paramStr -> orgProdNm.equals(prepareProcessProdNm(paramStr.toString().split("####")[2]))).findFirst().orElse("").toString();
    61. if (StrUtil.isNotBlank(firstResult)){
    62. return firstResult;
    63. }
    64. //step 2: no exact name match — treat products as matching when the product of their numeric spec values is equal
    65. String secondResult = paramList.stream().filter(paramStr -> {
    66. String comProdNm = prepareProcessProdNm(paramStr.toString().split("####")[2]);
    67. //convert Chinese numerals to digits so specs like "五盒装" (pack of five) become comparable
    68. orgProdNm = CommonUtil.zh2num(orgProdNm);
    69. comProdNm = CommonUtil.zh2num(comProdNm);
    70. List<String> topProdValues = ReUtil.findAll("(\\d+(\\.\\d+)?)", orgProdNm, 0, new ArrayList<String>());
    71. List<String> comProdValues = ReUtil.findAll("(\\d+(\\.\\d+)?)", comProdNm, 0, new ArrayList<String>());
    72. double topValue = 1.0;
    73. double comValue = 1.0;
    74. for (String s : topProdValues){
    75. topValue = topValue * Double.parseDouble(s);
    76. }
    77. for (String s : comProdValues){
    78. comValue = comValue * Double.parseDouble(s);
    79. }
    80. if (NumberUtil.compare(topValue, comValue) == 0)
    81. return true;
    82. return false;
    83. }).findFirst().orElse("").toString();
    84. return toResult(secondResult);
    85. }
    86. @Override
    // NOTE(review): returning null here violates the GenericUDF contract — Hive uses
    // this string in EXPLAIN output; return a short description instead.
    87. public String getDisplayString(String[] strings) {
    88. return null;
    89. }
    // Splits a "####"-joined match string into the declared 6-field struct
    // (Text, Text, Text, FloatWritable, IntWritable, Text); blank input is
    // passed through unchanged.
    90. private Object toResult(String resultStr){
    91. result = new Object[6];
    92. if (StrUtil.isBlank(resultStr)){
    93. return resultStr;
    94. }
    95. String[] arrayResult = resultStr.split("####");
    96. result[0] = new Text(arrayResult[0]);
    97. result[1] = new Text(arrayResult[1]);
    98. result[2] = new Text(arrayResult[2]);
    99. result[3] = new FloatWritable(Float.parseFloat(arrayResult[3]));
    100. result[4] = new IntWritable(Integer.parseInt(arrayResult[4]));
    101. result[5] = new Text(arrayResult[5]);
    102. return result;
    103. }
    // Normalizes a product name: strips whitespace/multiplication signs, removes
    // 【...】 promo fragments mentioning 满/减 (so discount text doesn't pollute
    // spec-number extraction), and drops unit suffixes like /盒, /瓶.
    104. private String prepareProcessProdNm(String prodNm){
    105. if (StrUtil.isBlank(prodNm)){
    106. return "";
    107. }
    108. //strip whitespace and multiplication signs (* and ×)
    109. prodNm = prodNm.replaceAll("\\s|\\*|×", "");
    110. //remove 【...】 fragments containing 满 and 减 (promo text) before spec extraction
    111. List<String> list = ReUtil.findAll("\\【(.*?)】", prodNm, 0, new ArrayList<>());
    112. for (String str : list){
    113. if (str.indexOf("满") != -1 && str.indexOf("减") != -1){
    114. prodNm = prodNm.replace(str, "");
    115. }
    116. }
    117. //drop meaningless per-unit suffixes
    118. String[] unitStr = {"/板", "/盒", "/瓶", "/支", "/袋", "/桶", "/包", "/吸", "/台"};
    119. for (String str : unitStr){
    120. prodNm = StrUtil.removeAll(prodNm, str);
    121. }
    122. return prodNm;
    123. }
    124. }