数据检查

数据维度(行列)

  1. # 查看table_name的记录条数
  2. SELECT COUNT(*) FROM table_name;
  3. # 查看table_name的字段数量
  4. SELECT COUNT(*) as column_num FROM information_schema.`COLUMNS`
  5. WHERE TABLE_NAME='table_name';

表信息
DESC命令查看数据表的整体信息

  1. # 查看table_name表信息
  2. DESC table_name;
  3. # 显示信息:Field、Type、Null、Key、Default、Extra

查找空值
Mysql中可以使用IS NULL来判断空值。

  1. # 查看price为空的数据
  2. SELECT * FROM table_name WHERE price IS NULL;
  3. # 查看price为0的数据
  4. SELECT * FROM table_name WHERE price=0;

查看唯一值

  1. # 查询单列唯一值
  2. SELECT DISTINCT (city)FROM table_name;

查看列名称
Mysql中使用COLUMNS函数用来单独查看数据表中的列名称。(与DESC查询结果一致)

  1. # 查看数据表列名称
  2. SHOW COLUMNS FROM table_name;

数据清洗

处理缺失值(填充)
选择填充的方式来处理空值,使用price列的均值来填充0值字段。 具体分为两个步骤:第一步计算table_name数据表中price列的均值,并保留两位小数。第二步使用price列的均值更新price列中0值的字段。

  1. # 计算price列的均值
  2. SELECT ROUND(AVG(price),2) AS avg_price FROM table_name;
  3. # 使用均值填充0值
  4. UPDATE table_name SET price=2199.67 WHERE price=0;

清理空格

  1. # 清理字符中的空格
  2. UPDATE table_name SET city = TRIM(city);

清理空格还有其它函数,详情列表请查看体系文档。

数值修改及替换

  1. # 在table_name表中,将city字段中含有的“成都”替换为“成都市”。
  2. UPDATE table_name SET city = REPLACE(city,'成都','成都市');

百分比

  1. -- CONCAT('a/b','%')代表拼接%
  2. SELECT CONCAT(CAST(round((1/2)*100,2) AS CHAR),'%') AS percentage FROM table_name;

判断多行

  1. SELECT
  2. id_card,
  3. CASE
  4. id_name
  5. WHEN cast(locate(',', id_name) AS SIGNED) > 0 THEN
  6. "姓名重复" ELSE '' END
  7. AS duplication_name,
  8. CASE
  9. class
  10. WHEN CAST(locate(',', class)AS SIGNED) > 0 THEN
  11. "年级重复" ELSE '' END
  12. AS duplication_class
  13. FROM
  14. ( SELECT id_card, GROUP_CONCAT( DISTINCT name ) AS id_name, GROUP_CONCAT( DISTINCT class ) AS class FROM `student` GROUP BY id_card ) AS t

数据预处理

数据表匹配合并
类似Excel中的VLOOKUP()函数和Power Query的合并查询功能。 其中与Power Query的“左外部、右外部、完全外部等”有类似功能,SQL表连接JOIN匹配常用的模式有INNER JOIN,LEFT JOIN,和RIGHT JOIN。

  1. /* 为便于理解 */
  2. # 内连接
  3. SELECT 学员信息表.*, 学员成绩表.*
  4. FROM 学员信息表 INNER JOIN 学员成绩表 ON 学员信息表.学号 = 学员成绩表.学号;
  5. # 左连接
  6. SELECT 学员信息表.*, 学员成绩表.*
  7. FROM 学员信息表 LEFT JOIN 学员成绩表 ON 学员信息表.学号 = 学员成绩表.学号;
  8. # 右连接
  9. SELECT 学员信息表.*, 学员成绩表.*
  10. FROM 学员信息表 RIGHT JOIN 学员成绩表 ON 学员信息表.学号 = 学员成绩表.学号;

排序

  1. # 按price 升序排列
  2. SELECT * FROM table_name ORDER BY price;
  3. # 按price 降序排列
  4. SELECT * FROM table_name ORDER BY price DESC;

数据分组

  1. /* 为了便于理解,贴一个我曾经的写过的语句 */
  2. -- 通过日期按星期分类汇总
  3. SELECT `区域城市`,`宴会编码`,`跟单员`,`日期`,
  4. CASE DAYNAME(日期)
  5. WHEN 'Monday' THEN '星期一'
  6. WHEN 'Tuesday' THEN '星期二'
  7. WHEN 'Wednesday' THEN '星期三'
  8. WHEN 'Thursday' THEN '星期四'
  9. WHEN 'Friday' THEN '星期五'
  10. WHEN 'Saturday' THEN '星期六'
  11. WHEN 'Sunday' THEN '星期日'
  12. ELSE '其他' END
  13. as 星期,时间,`桌数`,`购酒数`,`宴会类型`
  14. FROM `宴会基础信息表`
  15. -- 直接分组查询并汇总
  16. SELECT COUNT(id) AS id_count, SUM(price)AS total_price,
  17. CASE
  18. WHEN age<30 THEN 'A'
  19. WHEN age>=30 AND age<50 THEN 'B'
  20. WHEN age>=50 THEN 'C'
  21. ELSE 'D' END AS age_type
  22. FROM table_name GROUP BY age_type ORDER BY id_count;

数据分列

  1. # category依据 “-” 分列
  2. SELECT SUBSTRING_INDEX(category,'-',1)AS size,SUBSTRING_INDEX(category,'-',-1)AS colour
  3. FROM table_name;
  4. # 按分列后的结果进行单列数据汇总
  5. SELECT SUBSTRING_INDEX(category,'-',1) AS size,COUNT(id)
  6. FROM table_name GROUP BY size;
  7. # 按分列后的结果进行多列数据汇总
  8. SELECT
  9. SUBSTRING_INDEX(category,'-',1) AS size,
  10. COUNT(id) AS id_count,ROUND(SUM(price),2) AS total_price
  11. FROM table_name GROUP BY size;
  12. # 更新分列后的字段内容
  13. UPDATE table_name SET size = SUBSTRING_INDEX(category,'-',1),colour =SUBSTRING_INDEX(category,'-',-1);

数据提取

  1. -- 按列提取
  2. SELECT city FROM table_name;
  3. -- 按行提取
  4. SELECT * FROM table_name WHERE city='成都';
  5. -- 按行提取
  6. # 提取2到5行(4行)
  7. SELECT * FROM table_name LIMIT 2,5;
  8. -- 按条件提取并计算
  9. SELECT AVG(price) FROM table_name WHERE city='成都' AND price < 19.88;
  10. -- JSON中提取
  11. SELECT JSON_EXTRACT('{"name":"沐之杰","age":"13240133388"}',"$.name");

复杂提取

  1. -- 38°珍藏级剑南春(名烟酒)[500ml]
  2. # 假设table_name表中的col_name列有这样一个值:38°珍藏级剑南春(名烟酒)[500ml],需要分别提取38°、珍藏级剑南春、名烟酒、500ml
  3. SELECT
  4. SUBSTRING_INDEX(col_name,'°',1) as 度数, -- 38
  5. SUBSTRING_INDEX(SUBSTRING_INDEX(col_name,'(',1),'°',-1) as 品名, -- 珍藏级剑南春
  6. SUBSTRING_INDEX(col_name,')',-1) as 规格, -- [500ml]
  7. SUBSTRING_INDEX(SUBSTRING_INDEX(col_name,"(",-1),')',1) as 版本, -- 名烟酒
  8. SUBSTRING_INDEX(SUBSTRING_INDEX(col_name,"[",-1),']',1) as 容量 -- 500ml
  9. FROM table_name

数据筛选

按条件筛选需要用到算术运算、逻辑运算符以及LIKE通配符等。

  1. # 筛选city不等于成都的
  2. SELECT * FROM table_name WHERE city !='成都';
  3. # 筛选name(模糊筛选)中以“王”开头的
  4. SELECT * FROM table_name WHERE city LIKE '王%';
  5. #筛选后计数
  6. SELECT COUNT(id) AS id_count FROM table_name WHERE city='成都'AND price < 19.88;

统计汇总

使用分类汇总可以按特定维度对数据进行统计分析,mysql中使用的主要函数是 GROUP BY、CASE WHEN以及GROUP_CONCAT。

  1. /* 来个复杂点儿的 */
  2. # 查询不同尺码下的不同颜色的产品销售金额
  3. SELECT sizenote,colornote,goodsid,sum(goodsprice * amount) AS 销售额
  4. FROM orderdetail LEFT JOIN goodssize ON orderdetail.sizeid = goodssize.sizeid
  5. LEFT JOIN goodscolor ON orderdetail.colorid = goodscolor.colorid
  6. GROUP BY sizenote, colornote, goodsid
  7. ORDER BY sizenote, colornote, 销售额 DESC;
  8. # 查询销售大区对应哪些办事处
  9. SELECT `销售大区`,GROUP_CONCAT(`办事处`) FROM `2018年某牌销售表` GROUP BY `销售大区`;

按条件统计

  1. select count(if(mt=0,1,null)) as a,count(if(mt>0,1,null)) as b from table
  2. select sum(mt=0) as a,sum(mt>0) as b from table

多表连接

  1. -- 3表右连接,空值处理
  2. SELECT * FROM (
  3. SELECT * FROM (
  4. SELECT t1.ymd,t1.skbs,t2.zdqy,SUM(t1.skbs + IFNULL(t2.zdqy,0)) AS '总计' FROM
  5. (
  6. (SELECT DATE_FORMAT(submit_time,'%Y-%m-%d')AS ymd, count(*) AS 'skbs'
  7. from skbs_risk_entry WHERE remark LIKE '%陕西%' GROUP BY ymd ORDER BY ymd) AS t1
  8. LEFT JOIN
  9. (SELECT DATE_FORMAT(submit_time,'%Y-%m-%d')AS ymd, count(*) AS 'zdqy'
  10. from zdqy_risk_entry WHERE remark LIKE '%西安%' GROUP BY ymd ORDER BY ymd) AS t2
  11. ON t1.ymd = t2.ymd
  12. ) GROUP BY t1.ymd,t1.skbs,t2.zdqy) AS t11
  13. LEFT JOIN
  14. (SELECT DATE_FORMAT(submit_time,'%Y-%m-%d') AS ymdsm, count(*) AS 'sm'
  15. from sm_risk_entry WHERE remark LIKE '%西安%' GROUP BY ymdsm) AS t12
  16. ON t11.ymd = t12.ymdsm
  17. ) AS t23

数据转换

日期格式

  1. /* Mysql */
  2. SELECT DATE_FORMAT('2021-01-01','%Y-%m-%d %H:%i:%s') --2021-01-01 00:00:00
  3. /* Postgresql */
  4. SELECT to_timestamp('2012-05-01','yyyy-mm-dd hh24:mi:ss') -- 2012-05-01 00:00:00+08
  5. select date '2018-03-05';
  6. select '2018-03-05'::date;
  7. select to_date('2018-03-05', 'YYYY-MM-DD');
  8. select timestamp '2018-03-05 10:23:43';
  9. select '2018-03-05 10:23:43'::timestamp;
  10. select '2018-03-05'::timestamp;
  11. select to_timestamp('2018-03-05 13:23:45.485', 'YYYY-MM-DD HH24:MI:SS.MS');
  12. -- 处理日期格式为日期+时间
  13. UPDATE scjdgl_ztjbxx_gx SET create_time = create_time::timestamp
  14. SELECT to_date('202201','YYYYMMDD'); -- 2022-01-01