数据检查
数据维度(行列)
# 查看table_name的记录条数SELECT COUNT(*) FROM table_name;# 查看table_name的字段数量SELECT COUNT(*) as column_num FROM information_schema.`COLUMNS`WHERE TABLE_NAME='table_name';
表信息
DESC命令查看数据表的整体信息
# 查看table_name表信息DESC table_name;# 显示信息:Field、Type、Null、Key、Default、Extra
查找空值
Mysql中可以使用IS NULL来判断空值。
# 查看price为空的数据SELECT * FROM table_name WHERE price IS NULL;# 查看price为0的数据SELECT * FROM table_name WHERE price=0;
查看唯一值
# 查询单列唯一值SELECT DISTINCT (city)FROM table_name;
查看列名称
Mysql中使用COLUMNS函数用来单独查看数据表中的列名称。(与DESC查询结果一致)
# 查看数据表列名称SHOW COLUMNS FROM table_name;
数据清洗
处理缺失值(填充)
选择填充的方式来处理空值,使用price列的均值来填充0值字段。 具体分为两个步骤:第一步计算table_name数据表中price列的均值,并保留两位小数。第二步使用price列的均值更新price列中0值的字段。
# 计算price列的均值SELECT ROUND(AVG(price),2) AS avg_price FROM table_name;# 使用均值填充0值UPDATE table_name SET price=2199.67 WHERE price=0;
清理空格
# 清理字符中的空格UPDATE table_name SET city = TRIM(city);
清理空格还有其它函数,详情列表请查看体系文档。
数值修改及替换
# 在table_name表中,将city字段中含有的“成都”替换为“成都市”。UPDATE table_name SET city = REPLACE(city,'成都','成都市');
百分比
-- CONCAT('a/b','%')代表拼接%SELECT CONCAT(CAST(round((1/2)*100,2) AS CHAR),'%') AS percentage FROM table_name;
判断多行
SELECTid_card,CASEid_nameWHEN cast(locate(',', id_name) AS SIGNED) > 0 THEN"姓名重复" ELSE '' ENDAS duplication_name,CASEclassWHEN CAST(locate(',', class)AS SIGNED) > 0 THEN"年级重复" ELSE '' ENDAS duplication_classFROM( SELECT id_card, GROUP_CONCAT( DISTINCT name ) AS id_name, GROUP_CONCAT( DISTINCT class ) AS class FROM `student` GROUP BY id_card ) AS t
数据预处理
数据表匹配合并
类似Excel中的VLOOKUP()函数和Power Query的合并查询功能。 其中与Power Query的“左外部、右外部、完全外部等”有类似功能,SQL表连接JOIN匹配常用的模式有INNER JOIN,LEFT JOIN,和RIGHT JOIN。
/* 为便于理解 */# 内连接SELECT 学员信息表.*, 学员成绩表.*FROM 学员信息表 INNER JOIN 学员成绩表 ON 学员信息表.学号 = 学员成绩表.学号;# 左连接SELECT 学员信息表.*, 学员成绩表.*FROM 学员信息表 LEFT JOIN 学员成绩表 ON 学员信息表.学号 = 学员成绩表.学号;# 右连接SELECT 学员信息表.*, 学员成绩表.*FROM 学员信息表 RIGHT JOIN 学员成绩表 ON 学员信息表.学号 = 学员成绩表.学号;
排序
# 按price 升序排列SELECT * FROM table_name ORDER BY price;# 按price 降序排列SELECT * FROM table_name ORDER BY price DESC;
数据分组
/* 为了便于理解,贴一个我曾经的写过的语句 */-- 通过日期按星期分类汇总SELECT `区域城市`,`宴会编码`,`跟单员`,`日期`,CASE DAYNAME(日期)WHEN 'Monday' THEN '星期一'WHEN 'Tuesday' THEN '星期二'WHEN 'Wednesday' THEN '星期三'WHEN 'Thursday' THEN '星期四'WHEN 'Friday' THEN '星期五'WHEN 'Saturday' THEN '星期六'WHEN 'Sunday' THEN '星期日'ELSE '其他' ENDas 星期,时间,`桌数`,`购酒数`,`宴会类型`FROM `宴会基础信息表`-- 直接分组查询并汇总SELECT COUNT(id) AS id_count, SUM(price)AS total_price,CASEWHEN age<30 THEN 'A'WHEN age>=30 AND age<50 THEN 'B'WHEN age>=50 THEN 'C'ELSE 'D' END AS age_typeFROM table_name GROUP BY age_type ORDER BY id_count;
数据分列
# category依据 “-” 分列SELECT SUBSTRING_INDEX(category,'-',1)AS size,SUBSTRING_INDEX(category,'-',-1)AS colourFROM table_name;# 按分列后的结果进行单列数据汇总SELECT SUBSTRING_INDEX(category,'-',1) AS size,COUNT(id)FROM table_name GROUP BY size;# 按分列后的结果进行多列数据汇总SELECTSUBSTRING_INDEX(category,'-',1) AS size,COUNT(id) AS id_count,ROUND(SUM(price),2) AS total_priceFROM table_name GROUP BY size;# 更新分列后的字段内容UPDATE table_name SET size = SUBSTRING_INDEX(category,'-',1),colour =SUBSTRING_INDEX(category,'-',-1);
数据提取
-- 按列提取SELECT city FROM table_name;-- 按行提取SELECT * FROM table_name WHERE city='成都';-- 按行提取# 提取2到5行(4行)SELECT * FROM table_name LIMIT 2,5;-- 按条件提取并计算SELECT AVG(price) FROM table_name WHERE city='成都' AND price < 19.88;-- 从JSON中提取SELECT JSON_EXTRACT('{"name":"沐之杰","age":"13240133388"}',"$.name");
复杂提取
-- 38°珍藏级剑南春(名烟酒)[500ml]# 假设table_name表中的col_name列有这样一个值:38°珍藏级剑南春(名烟酒)[500ml],需要分别提取38°、珍藏级剑南春、名烟酒、500mlSELECTSUBSTRING_INDEX(col_name,'°',1) as 度数, -- 38SUBSTRING_INDEX(SUBSTRING_INDEX(col_name,'(',1),'°',-1) as 品名, -- 珍藏级剑南春SUBSTRING_INDEX(col_name,')',-1) as 规格, -- [500ml]SUBSTRING_INDEX(SUBSTRING_INDEX(col_name,"(",-1),')',1) as 版本, -- 名烟酒SUBSTRING_INDEX(SUBSTRING_INDEX(col_name,"[",-1),']',1) as 容量 -- 500mlFROM table_name
数据筛选
按条件筛选需要用到算术运算、逻辑运算符以及LIKE通配符等。
# 筛选city不等于成都的SELECT * FROM table_name WHERE city !='成都';# 筛选name(模糊筛选)中以“王”开头的SELECT * FROM table_name WHERE city LIKE '王%';#筛选后计数SELECT COUNT(id) AS id_count FROM table_name WHERE city='成都'AND price < 19.88;
统计汇总
使用分类汇总可以按特定维度对数据进行统计分析,mysql中使用的主要函数是 GROUP BY、CASE WHEN以及GROUP_CONCAT。
/* 来个复杂点儿的 */# 查询不同尺码下的不同颜色的产品销售金额SELECT sizenote,colornote,goodsid,sum(goodsprice * amount) AS 销售额FROM orderdetail LEFT JOIN goodssize ON orderdetail.sizeid = goodssize.sizeidLEFT JOIN goodscolor ON orderdetail.colorid = goodscolor.coloridGROUP BY sizenote, colornote, goodsidORDER BY sizenote, colornote, 销售额 DESC;# 查询销售大区对应哪些办事处SELECT `销售大区`,GROUP_CONCAT(`办事处`) FROM `2018年某牌销售表` GROUP BY `销售大区`;
按条件统计
select count(if(mt=0,1,null)) as a,count(if(mt>0,1,null)) as b from tableselect sum(mt=0) as a,sum(mt>0) as b from table
多表连接
-- 3表右连接,空值处理SELECT * FROM (SELECT * FROM (SELECT t1.ymd,t1.skbs,t2.zdqy,SUM(t1.skbs + IFNULL(t2.zdqy,0)) AS '总计' FROM((SELECT DATE_FORMAT(submit_time,'%Y-%m-%d')AS ymd, count(*) AS 'skbs'from skbs_risk_entry WHERE remark LIKE '%陕西%' GROUP BY ymd ORDER BY ymd) AS t1LEFT JOIN(SELECT DATE_FORMAT(submit_time,'%Y-%m-%d')AS ymd, count(*) AS 'zdqy'from zdqy_risk_entry WHERE remark LIKE '%西安%' GROUP BY ymd ORDER BY ymd) AS t2ON t1.ymd = t2.ymd) GROUP BY t1.ymd,t1.skbs,t2.zdqy) AS t11LEFT JOIN(SELECT DATE_FORMAT(submit_time,'%Y-%m-%d') AS ymdsm, count(*) AS 'sm'from sm_risk_entry WHERE remark LIKE '%西安%' GROUP BY ymdsm) AS t12ON t11.ymd = t12.ymdsm) AS t23
数据转换
日期格式
/* Mysql */SELECT DATE_FORMAT('2021-01-01','%Y-%m-%d %H:%i:%s') --2021-01-01 00:00:00/* Postgresql */SELECT to_timestamp('2012-05-01','yyyy-mm-dd hh24:mi:ss') -- 2012-05-01 00:00:00+08select date '2018-03-05';select '2018-03-05'::date;select to_date('2018-03-05', 'YYYY-MM-DD');select timestamp '2018-03-05 10:23:43';select '2018-03-05 10:23:43'::timestamp;select '2018-03-05'::timestamp;select to_timestamp('2018-03-05 13:23:45.485', 'YYYY-MM-DD HH24:MI:SS.MS');-- 处理日期格式为日期+时间UPDATE scjdgl_ztjbxx_gx SET create_time = create_time::timestampSELECT to_date('202201','YYYYMMDD'); -- 2022-01-01
