hive窗口函数(一)

基础函数:SUM,AVG,MIN,MAX

数据准备

创建数据表

1
2
3
4
5
6
CREATE EXTERNAL TABLE supertest(
cookieid string,
createtime string, --day
pv INT
) ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',' ;

插入数据

1
2
3
4
5
6
INSERT INTO supertest(cookieid,createtime,pv) VALUES('cookie1','2019-07-10',1)
INSERT INTO supertest(cookieid,createtime,pv) VALUES('cookie1','2019-07-11',2)
INSERT INTO supertest(cookieid,createtime,pv) VALUES('cookie1','2019-07-12',3)
INSERT INTO supertest(cookieid,createtime,pv) VALUES('cookie1','2019-07-13',4)
INSERT INTO supertest(cookieid,createtime,pv) VALUES('cookie1','2019-07-14',5)
INSERT INTO supertest(cookieid,createtime,pv) VALUES('cookie1','2019-07-15',4)

数据如下:

1
2
3
4
5
6
7
select * from supertest;
cookie1 2019-07-10 1
cookie1 2019-07-15 4
cookie1 2019-07-12 3
cookie1 2019-07-13 4
cookie1 2019-07-14 5
cookie1 2019-07-11 2

基础函数:SUM,AVG,MIN,MAX

SUM求和

1
2
3
4
5
6
7
8
9
10
11
12
SELECT cookieid,createtime,pv,
-- 默认为从起点到当前行
sum(pv) OVER(PARTITION BY cookieid ORDER BY createtime) as pv1,
-- 组内所有行
sum(pv) OVER(PARTITION BY cookieid) as pv2,
--当前行+往前3行+往后1行
sum(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv3,
--当前行+往后所有行
sum(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv4,
--从起点到当前行,结果同pv1
sum(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv5
from supertest

结果为:

1
2
3
4
5
6
7
   cookieid	createtime	pv	pv1	pv2	pv3	pv4	pv5
1 cookie1 2019-07-10 1 1 19 3 19 1
2 cookie1 2019-07-11 2 3 19 6 18 3
3 cookie1 2019-07-12 3 6 19 10 16 6
4 cookie1 2019-07-13 4 10 19 15 13 10
5 cookie1 2019-07-14 5 15 19 18 9 15
6 cookie1 2019-07-15 4 19 19 16 4 19

pv1:分组内从起点到当前行的和
pv2:同组内所有pv和
pv3:当前行往前三行至往后一行的和
pv4:当前行到最后一行的和
pv5:同pv1

若不指定ROWS BETWEEN,默认为从起点到当前行
若不指定ORDER BY,则将分组内所有值叠加
** WINDOW字句 **:
PRECEDING:往前
FOLLOWING:往后
CURRENT ROW:当前行
UNBOUNDED:起点,UNBOUNDED PRECEDING 表示从前面的起点, UNBOUNDED FOLLOWING:表示到后面的终点

以下用法和sum大同小异,不再多解释

AVG

1
2
3
4
5
6
7
8
9
10
11
12
SELECT cookieid,createtime,pv,
-- 默认为从起点到当前行
avg(pv) OVER(PARTITION BY cookieid ORDER BY createtime) as pv1,
-- 组内所有行
avg(pv) OVER(PARTITION BY cookieid) as pv2,
--当前行+往前3行+往后1行
avg(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv3,
--当前行+往后所有行
avg(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv4,
--从起点到当前行,结果同pv1
avg(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv5
from supertest

结果:

1
2
3
4
5
6
7
   cookieid	createtime	pv	pv1	pv2	pv3	pv4	pv5
1 cookie1 2019-07-10 1 1 3.1666666666666665 1.5 3.1666666666666665 1
2 cookie1 2019-07-11 2 1.5 3.1666666666666665 2 3.6 1.5
3 cookie1 2019-07-12 3 2 3.1666666666666665 2.5 4 2
4 cookie1 2019-07-13 4 2.5 3.1666666666666665 3 4.333333333333333 2.5
5 cookie1 2019-07-14 5 3 3.1666666666666665 3.6 4.5 3
6 cookie1 2019-07-15 4 3.1666666666666665 3.1666666666666665 4 4 3.1666666666666665

MIN

1
2
3
4
5
6
7
8
9
10
11
12
SELECT cookieid,createtime,pv,
-- 默认为从起点到当前行
min(pv) OVER(PARTITION BY cookieid ORDER BY createtime) as pv1,
-- 组内所有行
min(pv) OVER(PARTITION BY cookieid) as pv2,
--当前行+往前3行+往后1行
min(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv3,
--当前行+往后所有行
min(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv4,
--从起点到当前行,结果同pv1
min(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv5
from supertest

MAX

1
2
3
4
5
6
7
8
9
10
11
12
SELECT cookieid,createtime,pv,
-- 默认为从起点到当前行
max(pv) OVER(PARTITION BY cookieid ORDER BY createtime) as pv1,
-- 组内所有行
max(pv) OVER(PARTITION BY cookieid) as pv2,
--当前行+往前3行+往后1行
max(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv3,
--当前行+往后所有行
max(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv4,
--从起点到当前行,结果同pv1
max(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv5
from supertest
0%