1、将hive数据导出到csv:
# Three independent export recipes (the original note had them concatenated
# onto one line with no separators, so none of them could run as pasted).

# 1) Convert a TSV export to Excel with the julia helper script
#    (args: header names, column types, input .tsv, output .xlsx).
julia /dwd/bak/yuanhu/julia/tsv_2_excel.jl "订单号,下单日期,业务类型,店铺名称,店铺ID,评价日期,评分,商品编码,SKUID,品牌ID,品牌,商品名称" "string,string,string,string,string,string,string,string,string,string,string,string" "/dwd/bak/张楠池/data/分析POP订单商品评价分析_2021-11-29.tsv" "/dwd/bak/张楠池/data/分析POP订单商品评价分析_2021-11-29.xlsx"

# 2) Dump a hive query to CSV by collapsing every whitespace run into a comma.
#    NOTE(review): this also replaces spaces INSIDE field values — fine for
#    numeric/ID columns, lossy for free text.
hive -e 'select books from table' | sed 's/[[:space:]]\+/,/g' > /home/lvermeer/temp.csv

# 3) Run SQL through beeline with tsv2 output and capture it to a file.
#    NOTE(review): credentials are embedded in plain text here.
beeline -u jdbc:hive2://172.16.16.7:10000 -n dwetl -p ddky@admin --verbose=true --outputformat=tsv2 -e "drop table temp.tmp_aa; create table temp.tmp_aa as select order_id, order_amt from dw.s01_or_order where ds = '2021-11-04' limit 30; select * from temp.tmp_aa;" > /dwd/bak/yuanhu/data/data.csv
2、yarn 上查看日志
# Fetch the aggregated logs for a finished/running YARN application.
# (The original line fused the redirect target, a comment, and the kill
# command into a single word — split here so each command is runnable.)
yarn logs -applicationId application_1517538889175_2550 > logs.txt

# Kill a running YARN application.
yarn application --kill application_1605802776975_1523527
3、hive显示列名,表头
-- Show column names as a header row in hive CLI query output.
set hive.cli.print.header=true;
4、hive 常用优化设置
-- Common Hive tuning settings, one per line. In the original collapsed note
-- each inline "--" comment swallowed every statement after it on the same
-- physical line, silently disabling those settings — reformatted here.

-- Join optimizations
set hive.exec.mode.local.auto=true;
SET hive.auto.convert.join = TRUE;
SET hive.optimize.correlation = TRUE;
SET hive.auto.convert.join.noconditionaltask = TRUE;
SET hive.exec.parallel = TRUE;
-- Skewed-join mitigation
set hive.optimize.skewjoin=true;
set hive.skewjoin.key=100000;
-- Memory tuning
set mapred.reduce.tasks = 20;
set mapreduce.map.memory.mb = 4096;
set mapreduce.reduce.memory.mb = 4096;
set mapreduce.map.java.opts=-Xmx3278m;
set mapreduce.reduce.java.opts=-Xmx3278m;
-- Control the number of mappers
set mapred.max.split.size=100000000;
set mapred.min.split.size.per.node=100000000;
set mapred.min.split.size.per.rack=100000000;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; -- merge small files before the map phase
-- Control the number of reducers
-- NOTE(review): this second mapred.reduce.tasks overrides the value set above;
-- keep whichever group applies to the job at hand.
set hive.exec.reducers.bytes.per.reducer=100000000;
set mapred.reduce.tasks = 150;
-- Predicate pushdown: filter at the storage layer as early as possible to
-- shrink the data volume of later stages.
set hive.optimize.ppd=true;
-- Map-side partial aggregation greatly reduces the data shipped to reducers.
set hive.map.aggr=true;
set hive.groupby.skewindata=true;
-- UNION ALL note: count/distinct/group by over a UNION ALL makes Hive scan the
-- same table once per branch; multi-insert does not. Rearranging so the
-- UNION ALL runs first and count/distinct/group by run afterwards avoids the
-- repeated scans.
-- Large-table to large-table joins (sort-merge bucket join)
set hive.auto.convert.sortmerge.join=true;
set hive.optimize.bucketmapjoin = true;
set hive.optimize.bucketmapjoin.sortedmerge = true;
set hive.auto.convert.sortmerge.join.noconditionaltask=true;
-- Hive on Spark: merge small output files
set hive.merge.sparkfiles=true;
set hive.merge.size.per.task = 256*1000*1000; -- NOTE(review): confirm Hive evaluates this arithmetic; otherwise write 256000000
5、hive建表语句
-- Staging table with a single comma-delimited text column, then load the
-- local file into it, replacing any existing contents.
CREATE TABLE temp.tmp_phone_dr (phone string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '/dwd/bak/yuanhu/20210206/phone.txt' OVERWRITE INTO TABLE temp.tmp_phone_dr;
6、hive添加字段
-- ALTER TABLE cookbook (Hive first, then MySQL). Reformatted: in the collapsed
-- original, each inline "--" comment commented out the statement that followed
-- it on the same line. Chinese-placeholder templates are kept as comments
-- because they are not executable SQL.

-- Let DROP TABLE also purge the data of an external table.
ALTER TABLE table_nm SET TBLPROPERTIES ('external.table.purge'='true');
-- CASCADE forces the column metadata of existing partitions to be refreshed too.
alter table table_name add columns (c_time string comment '当前时间') cascade; -- correct: column is appended at the end
alter table table_name change c_time c_time string after address ; -- correct: moves the column right after `address`
alter table table_name drop if exists partition(par_col=col_name); -- drop a partition
-- Rename a column (template): ALTER TABLE <tbl> CHANGE COLUMN <old_name> <new_name> <type>
ALTER TABLE table_name SET TBLPROPERTIES('comment' = '表的新注释'); -- update the table-level comment
ALTER TABLE test REPLACE COLUMNS(id BIGINT, name STRING); -- replace the entire column list
-- MySQL equivalents
ALTER TABLE device ADD COLUMN `device_type` VARCHAR(32) DEFAULT NULL COMMENT '设备类型';
-- Rename a column (MySQL template): ALTER TABLE <tbl> CHANGE <old_name> <new_name> <type>
-- Indexes
alter table table_name add index index_name (column_list) ;
create unique index id_index on s49_app_compete_prod_data_monitor(identifies, stat_dd);
-- Add a primary key
alter table `s49_app_shop_all_prod` add primary key (`identifies`);
-- Register UDF jars and functions (temporary and permanent).
ADD JAR /home/dwetl/hive_udf-1.0-SNAPSHOT.jar;
CREATE temporary function getMonthDays AS 'com.ddky.bi.GetMonthDay';
-- hdfs://172.16.16.12:8888/user/   (stray path fragment from the original note; not a statement)
ADD JAR hdfs://bidc/bi/udf_pkg/gettimeendpoit.jar;
CREATE temporary function getTimeEndpoit AS 'com.ddky.bi.GetTimeEndpoit';
-- Permanent functions (jar pulled from HDFS at create time)
create function is_blank as 'com.ddky.bi.StringIsBlank' using jar 'hdfs://bidc/bi/udf_pkg/hive_udf_permanent.jar';
create function replace_blank as 'com.ddky.bi.ReplaceBlankUDF' using jar 'hdfs://bidc/bi/udf_pkg/replace_bland_udf.jar';
-- Shell command to open the connection (not SQL; NOTE(review): plain-text credentials):
-- beeline -u "jdbc:hive2://172.16.16.7:10000" -ndwetl -pddky@admin
保留小数点后四位,再乘以 100 并拼接“%”,得到百分比字符串:
-- Keep four decimal places of x/y, scale to percent, append '%'.
-- NOTE(review): in Hive x/y yields NULL when y = 0 (no error), and the
-- double multiply can surface artifacts like 12.340000000000002 — confirm
-- both are acceptable for the report.
concat(round(x/y, 4) * 100, '%')
--hive 2.1.1 之后获取 jdbc 日志,需在 hive-site.xml 中关闭异步日志:<property><name>hive.async.log.enabled</name><value>false</value></property>
hive 利用反射调取java 代码
-- Invoke a static Java method from SQL via the built-in reflect() UDF;
-- here: org.apache.commons.lang.StringUtils.isNotBlank("").
select reflect("org.apache.commons.lang.StringUtils", "isNotBlank", "");
行转列代码:
-- Row-to-column pivot report. CTE t builds, per user, a map of
-- month -> "order_amt_order_num" via collect_list/concat_ws/str_to_map; the
-- outer query fans that map out into one amount and one count column per month
-- (2019-01 .. 2020-10, COALESCEd to 0), joins the user dimension (t1), and
-- LEFT JOINs pre-2019 order totals for Shenzhen/Dongguan shops of one source
-- category (t2).
-- NOTE(review): aliases beginning with a digit (201901_amt, 2018_order_num, …)
-- normally require backticks in Hive — confirm this parses on the target
-- version before reuse.
-- NOTE(review): the statement was collapsed when pasted: the three physical
-- lines below break mid-expression (e.g. "…[1],0) / 201904_amt…" and
-- "t1.user_id = / t3.user_id") and must be re-joined into one statement
-- before execution.
with t AS (SELECT user_id,str_to_map(concat_ws(',',collect_list(concat_ws(':',mm,concat(order_amt,'_',order_num))))) infoFROM temp.temp_lzl_2019_orderGROUP BY user_id)SELECT t1.user_id `用户ID`,t1.create_tm `用户注册时间`,t1.first_tm `首单时间`,t1.is_lte_0 `首单是否小于等于1元`,t2.2018_order_num `截止2018年12月订单数`,t2.2018_order_amt `截止2018年12月订单金额`,t3.201901_num `2019年1月订单数`,t3.201901_amt `2019年1月金额`,t3.201902_num `2019年2月订单数`,t3.201902_amt `2019年2月金额`,t3.201903_num `2019年3月订单数`,t3.201903_amt `2019年3月金额`,t3.201904_num `2019年4月订单数`,t3.201904_amt `2019年4月金额`,t3.201905_num `2019年5月订单数`,t3.201905_amt `2019年5月金额`,t3.201906_num `2019年6月订单数`,t3.201906_amt `2019年6月金额`,t3.201907_num `2019年7月订单数`,t3.201907_amt `2019年7月金额`,t3.201908_num `2019年8月订单数`,t3.201908_amt `2019年8月金额`,t3.201909_num `2019年9月订单数`,t3.201909_amt `2019年9月金额`,t3.201910_num `2019年10月订单数`,t3.201910_amt `2019年10月金额`,t3.201911_num `2019年11月订单数`,t3.201911_amt `2019年11月金额`,t3.201912_num `2019年12月订单数`,t3.201912_amt `2019年12月金额`,t3.202001_num `2020年1月订单数`,t3.202001_amt `2020年1月金额`,t3.202002_num `2020年2月订单数`,t3.202002_amt `2020年2月金额`,t3.202003_num `2020年3月订单数`,t3.202003_amt `2020年3月金额`,t3.202004_num `2020年4月订单数`,t3.202004_amt `2020年4月金额`,t3.202005_num `2020年5月订单数`,t3.202005_amt `2020年5月金额`,t3.202006_num `2020年6月订单数`,t3.202006_amt `2020年6月金额`,t3.202007_num `2020年7月订单数`,t3.202007_amt `2020年7月金额`,t3.202008_num `2020年8月订单数`,t3.202008_amt `2020年8月金额`,t3.202009_num `2020年9月订单数`,t3.202009_amt `2020年9月金额`,t3.202010_num `2020年10月订单数`,t3.202010_amt `2020年10月金额`FROM(SELECT user_id,create_tm,first_tm,is_lte_0FROM temp.temp_lzl_user) t1JOIN(SELECT user_id,coalesce(split(info['2019-01'],'_')[0],0) 201901_amt,coalesce(split(info['2019-01'],'_')[1],0) 201901_num,coalesce(split(info['2019-02'],'_')[0],0) 201902_amt,coalesce(split(info['2019-02'],'_')[1],0) 201902_num,coalesce(split(info['2019-03'],'_')[0],0) 201903_amt,coalesce(split(info['2019-03'],'_')[1],0) 201903_num,coalesce(split(info['2019-04'],'_')[0],0) 
201904_amt,coalesce(split(info['2019-04'],'_')[1],0) 201904_num,coalesce(split(info['2019-05'],'_')[0],0) 201905_amt,coalesce(split(info['2019-05'],'_')[1],0) 201905_num,coalesce(split(info['2019-06'],'_')[0],0) 201906_amt,coalesce(split(info['2019-06'],'_')[1],0) 201906_num,coalesce(split(info['2019-07'],'_')[0],0) 201907_amt,coalesce(split(info['2019-07'],'_')[1],0) 201907_num,coalesce(split(info['2019-08'],'_')[0],0) 201908_amt,coalesce(split(info['2019-08'],'_')[1],0) 201908_num,coalesce(split(info['2019-09'],'_')[0],0) 201909_amt,coalesce(split(info['2019-09'],'_')[1],0) 201909_num,coalesce(split(info['2019-10'],'_')[0],0) 201910_amt,coalesce(split(info['2019-10'],'_')[1],0) 201910_num,coalesce(split(info['2019-11'],'_')[0],0) 201911_amt,coalesce(split(info['2019-11'],'_')[1],0) 201911_num,coalesce(split(info['2019-12'],'_')[0],0) 201912_amt,coalesce(split(info['2019-12'],'_')[1],0) 201912_num,coalesce(split(info['2020-01'],'_')[0],0) 202001_amt,coalesce(split(info['2020-01'],'_')[1],0) 202001_num,coalesce(split(info['2020-02'],'_')[0],0) 202002_amt,coalesce(split(info['2020-02'],'_')[1],0) 202002_num,coalesce(split(info['2020-03'],'_')[0],0) 202003_amt,coalesce(split(info['2020-03'],'_')[1],0) 202003_num,coalesce(split(info['2020-04'],'_')[0],0) 202004_amt,coalesce(split(info['2020-04'],'_')[1],0) 202004_num,coalesce(split(info['2020-05'],'_')[0],0) 202005_amt,coalesce(split(info['2020-05'],'_')[1],0) 202005_num,coalesce(split(info['2020-06'],'_')[0],0) 202006_amt,coalesce(split(info['2020-06'],'_')[1],0) 202006_num,coalesce(split(info['2020-07'],'_')[0],0) 202007_amt,coalesce(split(info['2020-07'],'_')[1],0) 202007_num,coalesce(split(info['2020-08'],'_')[0],0) 202008_amt,coalesce(split(info['2020-08'],'_')[1],0) 202008_num,coalesce(split(info['2020-09'],'_')[0],0) 202009_amt,coalesce(split(info['2020-09'],'_')[1],0) 202009_num,coalesce(split(info['2020-10'],'_')[0],0) 202010_amt,coalesce(split(info['2020-10'],'_')[1],0) 202010_numFROM t) t3ON t1.user_id = 
t3.user_idLEFT JOIN(SELECT user_id,COUNT(o1.order_id) 2018_order_num,SUM(o1.order_amt)/100 2018_order_amtFROM dw.s01_or_order o1JOIN dim.s01_dim_shop s1ON o1.shop_id = s1.shop_cd AND s1.ds = '2020-10-31' AND s1.city_nm IN ('深圳市', '东莞市')JOIN dim.s01_dim_order_source s2ON s2.order_source_cd = o1.order_src_cd AND s2.source_cata_id = '1'WHERE o1.ds = '2020-10-31'AND o1.cplt_tm < '2019-01-01'AND o1.order_status_cd = 9GROUP BY user_id) t2ON t3.user_id = t2.user_id;
-- Shop-to-competitor-shop geo matching via a bounded cartesian product:
-- 1) temp_shop_lt_result: explode each internal shop into 10 rows (num_key
--    1..10) carrying its bounding box (minLat/minLng/maxLat/maxLng);
-- 2) temp_s49_shop_key: assign each external shop a random num_key in 1..9
--    so the subsequent join is spread across buckets instead of one hot key;
-- 3) join on num_key and keep pairs where the external shop's lon/lat falls
--    inside the internal shop's bounding box, writing the mapping partition.
-- NOTE(review): num_key buckets differ (1..10 vs ceiling(rand()*9) = 1..9),
-- so bucket 10 on the left side never matches — confirm this is intentional.
-- NOTE(review): this was collapsed when pasted (statements and set-commands
-- run together); re-insert line breaks before executing.
--笛卡尔积DROP TABLE temp.temp_shop_lt_result;CREATE TABLE temp.temp_shop_lt_result ASwith temp_shop_lt_arr AS (SELECT *,split("1,2,3,4,5,6,7,8,9,10",",") AS num_arrFROM temp.temp_shop_lt)SELECT shop_id,shop_nm,shop_lgtud,shop_lttud,area_lgtud,area_lttud,minLat,minLng,maxLat,maxLng,num_keyFROM temp_shop_lt_arr lateral view explode(num_arr) tb_view AS num_key;DROP TABLE temp.temp_s49_shop_key;CREATE TABLE temp.temp_s49_shop_key ASSELECT shop_key,shop_name,longitude,latitude,source,ceiling(rand()*9) AS num_keyFROM dw.s49_shop_info;use DW;set mapred.output.compress=true;set hive.exec.compress.output=true;set mapred.output.compression.codec=com.hadoop.compression.lzo.LzopCodec;set mapred.job.queue.name=root.mapreduce.phive;set hive.auto.convert.join=false;set mapred.reduce.tasks = 10;set hive.optimize.skewjoin=true;INSERT OVERWRITE TABLE dim.s49_dim_shop_mapping partition (ds = '2020-12-13')SELECT t2.source,t1.shop_id,t1.shop_nm,t1.shop_lgtud,t1.shop_lttud,t2.shop_key,t2.shop_name,t2.longitude,t2.latitudeFROM(SELECT shop_id,shop_nm,shop_lgtud,shop_lttud,area_lgtud,area_lttud,maxLng,maxLat,minLat,minLng,num_keyFROM temp.temp_shop_lt_result) t1JOIN(SELECT shop_key,shop_name,longitude,latitude,num_key,sourceFROM temp.temp_s49_shop_key) t2ON t1.num_key = t2.num_keyWHERE t2.longitude >= t1.minLngAND t2.longitude <= t1.maxLngAND t2.latitude >= t1.minLatAND t2.latitude <= t1.maxLat;
hive Sql中的高级应用:https://blog.csdn.net/qq_41712271/article/details/109224538
hive中percentile, percentile_approx
复杂udf函数编写:
package com.ddky.bi;

import cn.hutool.core.util.NumberUtil;
import cn.hutool.core.util.ReUtil;
import cn.hutool.core.util.StrUtil;
import com.ddky.bi.utils.CommonUtil;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import java.util.ArrayList;
import java.util.List;

/**
 * Hive GenericUDF that matches a "top" product name against a list of
 * competitor products and returns the first match as a 6-field struct:
 * (compete_shop_name, compete_shop_key, compete_prod_name, compete_price,
 * compete_monthly_sales, compete_deliver_type).
 *
 * Each element of the second (list) argument is a single string whose fields
 * are joined with "####"; field index 2 is the competitor product name.
 *
 * Matching strategy:
 *   1. exact name match after normalization (whitespace, "*"/"×", promo
 *      【满…减…】 brackets and per-unit suffixes such as "/盒" removed);
 *   2. otherwise, multiply every numeric token in each normalized name (the
 *      "spec product") and accept the first competitor whose spec product
 *      equals the top product's. Chinese numerals are converted first via
 *      CommonUtil.zh2num so e.g. "五盒装" participates.
 *
 * Created by yuanhu on 2021/7/12 17:29
 */
public class GGProdMatchUDF extends GenericUDF {

    /** Field separator inside each serialized competitor string. */
    private static final String FIELD_SEP = "####";

    private ListObjectInspector listObjectInspector;

    @Override
    public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
        if (objectInspectors.length != 2) {
            throw new UDFArgumentException("参数缺失,本函数至少俩个参数!");
        }
        if (!objectInspectors[1].getCategory().equals(ObjectInspector.Category.LIST)) {
            throw new UDFArgumentException("参数错误,本函数第二个参数必须是List类型!");
        }
        listObjectInspector = (ListObjectInspector) objectInspectors[1];

        // Declare the 6-field output struct; field order must match toResult().
        List<String> structFieldNames = new ArrayList<>();
        List<ObjectInspector> structFieldObjectInspector = new ArrayList<>();
        structFieldNames.add("compete_shop_name");
        structFieldObjectInspector.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
        structFieldNames.add("compete_shop_key");
        structFieldObjectInspector.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
        structFieldNames.add("compete_prod_name");
        structFieldObjectInspector.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
        structFieldNames.add("compete_price");
        structFieldObjectInspector.add(PrimitiveObjectInspectorFactory.writableFloatObjectInspector);
        structFieldNames.add("compete_monthly_sales");
        structFieldObjectInspector.add(PrimitiveObjectInspectorFactory.writableIntObjectInspector);
        structFieldNames.add("compete_deliver_type");
        structFieldObjectInspector.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames, structFieldObjectInspector);
    }

    @Override
    public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
        // Guard against SQL NULL arguments (the original dereferenced them
        // unconditionally and would NPE on a NULL row).
        if (deferredObjects[0].get() == null || deferredObjects[1].get() == null) {
            return null;
        }
        // Top product name, normalized once.
        final String topProdNm = prepareProcessProdNm(deferredObjects[0].get().toString());
        // Serialized competitor products.
        List<?> paramList = listObjectInspector.getList(deferredObjects[1].get());
        if (paramList == null) {
            return null;
        }

        // Step 1: an exact normalized-name match wins outright.
        // BUG FIX: the original returned the raw "####"-joined String here,
        // which does not conform to the struct ObjectInspector declared in
        // initialize(); it must go through toResult() like the step-2 result.
        String firstResult = paramList.stream()
                .map(Object::toString)
                .filter(paramStr -> topProdNm.equals(prepareProcessProdNm(paramStr.split(FIELD_SEP)[2])))
                .findFirst()
                .orElse("");
        if (StrUtil.isNotBlank(firstResult)) {
            return toResult(firstResult);
        }

        // Step 2: compare spec products. Hoisted out of the lambda: the
        // original re-applied zh2num to a shared mutable field once per
        // candidate, relying on zh2num being idempotent.
        final double topValue = specProduct(CommonUtil.zh2num(topProdNm));
        String secondResult = paramList.stream()
                .map(Object::toString)
                .filter(paramStr -> {
                    String comProdNm = CommonUtil.zh2num(prepareProcessProdNm(paramStr.split(FIELD_SEP)[2]));
                    return NumberUtil.compare(topValue, specProduct(comProdNm)) == 0;
                })
                .findFirst()
                .orElse("");
        return toResult(secondResult);
    }

    /** Multiplies every numeric token (integer or decimal) found in the name. */
    private double specProduct(String prodNm) {
        List<String> values = ReUtil.findAll("(\\d+(\\.\\d+)?)", prodNm, 0, new ArrayList<String>());
        double product = 1.0;
        for (String s : values) {
            product = product * Double.parseDouble(s);
        }
        return product;
    }

    @Override
    public String getDisplayString(String[] strings) {
        // BUG FIX: the original returned null, which breaks EXPLAIN output.
        return "gg_prod_match(" + String.join(", ", strings) + ")";
    }

    /**
     * Converts a "####"-joined competitor string into the declared struct
     * layout. Returns null (SQL NULL) when no match was found; the original
     * returned the empty String, which is inconsistent with the struct
     * ObjectInspector declared in initialize().
     */
    private Object toResult(String resultStr) {
        if (StrUtil.isBlank(resultStr)) {
            return null;
        }
        String[] arrayResult = resultStr.split(FIELD_SEP);
        Object[] result = new Object[6];
        result[0] = new Text(arrayResult[0]);
        result[1] = new Text(arrayResult[1]);
        result[2] = new Text(arrayResult[2]);
        result[3] = new FloatWritable(Float.parseFloat(arrayResult[3]));
        result[4] = new IntWritable(Integer.parseInt(arrayResult[4]));
        result[5] = new Text(arrayResult[5]);
        return result;
    }

    /**
     * Normalizes a product name: strips whitespace and multiplication signs,
     * removes 【…满…减…】 promotion bracket text (so promo amounts don't pollute
     * the spec product), and drops per-unit suffixes such as "/盒".
     */
    private String prepareProcessProdNm(String prodNm) {
        if (StrUtil.isBlank(prodNm)) {
            return "";
        }
        prodNm = prodNm.replaceAll("\\s|\\*|×", "");
        List<String> list = ReUtil.findAll("\\【(.*?)】", prodNm, 0, new ArrayList<>());
        for (String str : list) {
            if (str.indexOf("满") != -1 && str.indexOf("减") != -1) {
                prodNm = prodNm.replace(str, "");
            }
        }
        String[] unitStr = {"/板", "/盒", "/瓶", "/支", "/袋", "/桶", "/包", "/吸", "/台"};
        for (String str : unitStr) {
            prodNm = StrUtil.removeAll(prodNm, str);
        }
        return prodNm;
    }
}
