dep.txt
10 ACCOUNTING 1700
20 RESEARCH 1800
30 SALES 1900
40 OPERATIONS 1700
emp.txt
7369 SMITH CLERK 7902 1980-12-17 800.00 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600.00 300.00 30
7521 WARD SALESMAN 7698 1981-2-22 1250.00 500.00 30
7566 JONES MANAGER 7839 1981-4-2 2975.00 20
7654 MARTIN SALESMAN 7698 1981-9-28 1250.00 1400.00 30
7698 BLAKE MANAGER 7839 1981-5-1 2850.00 30
7782 CLARK MANAGER 7839 1981-6-9 2450.00 10
7788 SCOTT ANALYST 7566 1987-4-19 3000.00 20
7839 KING PRESIDENT 1981-11-17 5000.00 10
7844 TURNER SALESMAN 7698 1981-9-8 1500.00 0.00 30
7876 ADAMS CLERK 7788 1987-5-23 1100.00 20
7900 JAMES CLERK 7698 1981-12-3 950.00 30
7902 FORD ANALYST 7566 1981-12-3 3000.00 20
7934 MILLER CLERK 7782 1982-1-23 1300.00 10
查询搜索
hive (default)> select *from dept;
OK
dept.deptno dept.dname dept.loc
10 ACCOUNTING 1700
20 RESEARCH 1800
30 SALES 1900
40 OPERATIONS 1700
Time taken: 0.214 seconds, Fetched: 4 row(s)
创建部门表
create table if not exists dept(
deptno int,
dname string,
loc int
)
row format delimited fields terminated by '\t';
创建员工表
create table if not exists emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int)
row format delimited fields terminated by '\t';
加载数据
hive (default)> load data local inpath '/opt/module/data/dept.txt' into table dept;
Loading data to table default.dept
Table default.dept stats: [numFiles=1, totalSize=71]
OK
Time taken: 1.038 seconds
hive (default)> load data local inpath '/opt/module/data/emp.txt' into table emp;
Loading data to table default.emp
Table default.emp stats: [numFiles=1, totalSize=656]
OK
Time taken: 0.318 seconds
设置别名
hive (default)> select ename as name,deptno dn from emp;
OK
name dn
SMITH 20
ALLEN 30
WARD 30
JONES 20
MARTIN 30
BLAKE 30
CLARK 10
SCOTT 20
KING 10
TURNER 30
ADAMS 20
JAMES 30
FORD 20
MILLER 10
Time taken: 0.101 seconds, Fetched: 14 row(s)
总数量
hive (default)> select count(*) cnt from emp;
cnt
14
平均值
hive (default)> select avg(sal) avg_sal from emp;
avg_sal
2073.214285714286
工资总和
hive (default)> select sum(sal) sum_sal from emp;
OK
sum_sal
29025.0
工资的最大值(max)
select sum(sal) sum_sal from emp;
max_sal
5000.0
工资的最小值(min)
select min(sal) min_sal from emp;
min_sal
800.0
limit,LIMIT子句用于限制返回的行数。
hive (default)> select * from emp limit 3;
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7369 SMITH CLERK 7902 1980-12-17 800.0 NULL 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600.0 300.0 30
7521 WARD SALESMAN 7698 1981-2-22 1250.0 500.0 30
Time taken: 0.079 seconds, Fetched: 3 row(s)
where
hive (default)> select * from emp where sal >1000;
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7499 ALLEN SALESMAN 7698 1981-2-20 1600.0 300.0 30
7521 WARD SALESMAN 7698 1981-2-22 1250.0 500.0 30
7566 JONES MANAGER 7839 1981-4-2 2975.0 NULL 20
7654 MARTIN SALESMAN 7698 1981-9-28 1250.0 1400.0 30
7698 BLAKE MANAGER 7839 1981-5-1 2850.0 NULL 30
7782 CLARK MANAGER 7839 1981-6-9 2450.0 NULL 10
7788 SCOTT ANALYST 7566 1987-4-19 3000.0 NULL 20
7839 KING PRESIDENT NULL 1981-11-17 5000.0 NULL 10
7844 TURNER SALESMAN 7698 1981-9-8 1500.0 0.0 30
7876 ADAMS CLERK 7788 1987-5-23 1100.0 NULL 20
7902 FORD ANALYST 7566 1981-12-3 3000.0 NULL 20
7934 MILLER CLERK 7782 1982-1-23 1300.0 NULL 10
Time taken: 0.137 seconds, Fetched: 12 row(s)
查询工资是1500或5000的员工信息
hive (default)> select * from emp where sal IN (3000, 5000);
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7788 SCOTT ANALYST 7566 1987-4-19 3000.0 NULL 20
7839 KING PRESIDENT NULL 1981-11-17 5000.0 NULL 10
7902 FORD ANALYST 7566 1981-12-3 3000.0 NULL 20
Time taken: 0.096 seconds, Fetched: 3 row(s)
RLIKE子句是Hive中这个功能的一个扩展,其可以通过Java的正则表达式这个更强大的语言来指定匹配条件。
hive (default)> select ename, sal from emp where sal RLIKE '[2]';
OK
ename sal
WARD 1250.0
JONES 2975.0
MARTIN 1250.0
BLAKE 2850.0
CLARK 2450.0
Time taken: 0.07 seconds, Fetched: 5 row(s)
GROUP BY语句通常会和聚合函数一起使用,按照一个或者多个列队结果进行分组,然后对每个组执行聚合操作。
hive (default)> select t.deptno, avg(t.sal) avg_sal from emp t group by t.deptno;
t.deptno avg_sal
10 2916.6666666666665
20 2175.0
30 1566.666666666666
select t.deptno, t.job, max(t.sal) max_sal from emp t group by t.deptno, t.job;
t.deptno t.job max_sal
10 CLERK 1300.0
10 MANAGER 2450.0
10 PRESIDENT 5000.0
20 ANALYST 3000.0
20 CLERK 1100.0
20 MANAGER 2975.0
30 CLERK 950.0
30 MANAGER 2850.0
30 SALESMAN 1600.
having与where不同点
(1)where后面不能写分组函数,而having后面可以使用分组函数。
(2)having只用于group by分组统计语句。
每个部门的平均薪水大于2000的部门
select deptno, avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;