MAPREDUCE服务 MRS-窗口函数:Value Functions

时间:2024-04-10 09:06:15

Value Functions

通常情况下,要重视null值。如果指定了IGNORE NULLS,那么计算中所有包含x为null值的行都会被排除掉,如果所有行的x字段值都是null值,将会返回默认值,否则返回null值。

-- 数据准备
create table cookie_views( cookieid varchar,createtime timestamp,url varchar);
insert into cookie_views values
('cookie1',timestamp '2020-07-10 10:00:02','url20'),
('cookie1',timestamp '2020-07-10 10:00:00','url10'),
('cookie1',timestamp '2020-07-10 10:03:04','urll3'),
('cookie1',timestamp '2020-07-10 10:50:05','url60'),
('cookie1',timestamp '2020-07-10 11:00:00','url70'),
('cookie1',timestamp '2020-07-10 10:10:00','url40'),
('cookie1',timestamp '2020-07-10 10:50:01','url50'),
('cookie2',timestamp '2020-07-10 10:00:02','url23'),
('cookie2',timestamp '2020-07-10 10:00:00','url11'),
('cookie2',timestamp '2020-07-10 10:03:04','url33'),
('cookie2',timestamp '2020-07-10 10:50:05','url66'),
('cookie2',timestamp '2020-07-10 11:00:00','url77'),
('cookie2',timestamp '2020-07-10 10:10:00','url47'),
('cookie2',timestamp '2020-07-10 10:50:01','url55');
  • first_value(x)→ [same as input]

    描述:返回窗口的第一个值。

    SELECT cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
    FIRST_VALUE(url) OVER(PARTITION BY cookieid ORDER BY createtime) AS first1 
    FROM cookie_views;
    cookieid |       createtime        |  url  | rn | first1 
    ----------|-------------------------|-------|----|--------
     cookie1  | 2020-07-10 10:00:00.000 | url10 |  1 | url10  
     cookie1  | 2020-07-10 10:00:02.000 | url20 |  2 | url10  
     cookie1  | 2020-07-10 10:03:04.000 | urll3 |  3 | url10  
     cookie1  | 2020-07-10 10:10:00.000 | url40 |  4 | url10  
     cookie1  | 2020-07-10 10:50:01.000 | url50 |  5 | url10  
     cookie1  | 2020-07-10 10:50:05.000 | url60 |  6 | url10  
     cookie1  | 2020-07-10 11:00:00.000 | url70 |  7 | url10  
     cookie2  | 2020-07-10 10:00:00.000 | url11 |  1 | url11  
     cookie2  | 2020-07-10 10:00:02.000 | url23 |  2 | url11  
     cookie2  | 2020-07-10 10:03:04.000 | url33 |  3 | url11  
     cookie2  | 2020-07-10 10:10:00.000 | url47 |  4 | url11  
     cookie2  | 2020-07-10 10:50:01.000 | url55 |  5 | url11  
     cookie2  | 2020-07-10 10:50:05.000 | url66 |  6 | url11  
     cookie2  | 2020-07-10 11:00:00.000 | url77 |  7 | url11  
    (14 rows)
  • last_value(x)→ [same as input]

    描述:返回窗口的最后一个值。

    SELECT cookieid,createtime,url,
    ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
    LAST_VALUE(url) OVER(PARTITION BY cookieid ORDER BY createtime) AS last1 
    FROM cookie_views;
     cookieid |       createtime        |  url  | rn | last1 
    ----------|-------------------------|-------|----|-------
     cookie2  | 2020-07-10 10:00:00.000 | url11 |  1 | url11 
     cookie2  | 2020-07-10 10:00:02.000 | url23 |  2 | url23 
     cookie2  | 2020-07-10 10:03:04.000 | url33 |  3 | url33 
     cookie2  | 2020-07-10 10:10:00.000 | url47 |  4 | url47 
     cookie2  | 2020-07-10 10:50:01.000 | url55 |  5 | url55 
     cookie2  | 2020-07-10 10:50:05.000 | url66 |  6 | url66 
     cookie2  | 2020-07-10 11:00:00.000 | url77 |  7 | url77 
     cookie1  | 2020-07-10 10:00:00.000 | url10 |  1 | url10 
     cookie1  | 2020-07-10 10:00:02.000 | url20 |  2 | url20 
     cookie1  | 2020-07-10 10:03:04.000 | urll3 |  3 | urll3 
     cookie1  | 2020-07-10 10:10:00.000 | url40 |  4 | url40 
     cookie1  | 2020-07-10 10:50:01.000 | url50 |  5 | url50 
     cookie1  | 2020-07-10 10:50:05.000 | url60 |  6 | url60 
     cookie1  | 2020-07-10 11:00:00.000 | url70 |  7 | url70 
    (14 rows)
  • nth_value(x, offset)→ [same as input]

    描述:返回距窗口开头指定偏移量的值。偏移量从1开始。偏移量可以是任何标量表达式。如果偏移量为null或大于窗口中的值数,则返回null。偏移量不允许为0或者负数。

    SELECT cookieid,createtime,url,
    ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
    NTH_VALUE(url,3) OVER(PARTITION BY cookieid ORDER BY createtime) AS last1 
    FROM cookie_views;
    cookieid |       createtime        |  url  | rn | last1 
    ----------|-------------------------|-------|----|-------
     cookie1  | 2020-07-10 10:00:00.000 | url10 |  1 | NULL  
     cookie1  | 2020-07-10 10:00:02.000 | url20 |  2 | NULL  
     cookie1  | 2020-07-10 10:03:04.000 | urll3 |  3 | urll3 
     cookie1  | 2020-07-10 10:10:00.000 | url40 |  4 | urll3 
     cookie1  | 2020-07-10 10:50:01.000 | url50 |  5 | urll3 
     cookie1  | 2020-07-10 10:50:05.000 | url60 |  6 | urll3 
     cookie1  | 2020-07-10 11:00:00.000 | url70 |  7 | urll3 
     cookie2  | 2020-07-10 10:00:00.000 | url11 |  1 | NULL  
     cookie2  | 2020-07-10 10:00:02.000 | url23 |  2 | NULL  
     cookie2  | 2020-07-10 10:03:04.000 | url33 |  3 | url33 
     cookie2  | 2020-07-10 10:10:00.000 | url47 |  4 | url33 
     cookie2  | 2020-07-10 10:50:01.000 | url55 |  5 | url33 
     cookie2  | 2020-07-10 10:50:05.000 | url66 |  6 | url33 
     cookie2  | 2020-07-10 11:00:00.000 | url77 |  7 | url33 
    (14 rows)
  • lead(x[, offset[, default_value]])→ [same as input]

    描述:返回窗口分区中当前行之后的偏移行处的值。偏移量从0开始,即当前行。偏移量可以是任何标量表达式。默认偏移量为1。如果偏移量为null,则返回null。如果偏移量指向不在分区内的行,则返回default_value,或者如果未指定,则返回null。lead()函数要求指定窗口顺序。不得指定窗框。

    SELECT cookieid,createtime,url,
    ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
    LEAD(createtime,1,timestamp '2020-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS next_1_time,
    LEAD(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS next_2_time 
    FROM cookie_views;
     cookieid |       createtime        |  url  | rn |       next_1_time       |       next_2_time       
    ----------|-------------------------|-------|----|-------------------------|-------------------------
     cookie2  | 2020-07-10 10:00:00.000 | url11 |  1 | 2020-07-10 10:00:02.000 | 2020-07-10 10:03:04.000 
     cookie2  | 2020-07-10 10:00:02.000 | url23 |  2 | 2020-07-10 10:03:04.000 | 2020-07-10 10:10:00.000 
     cookie2  | 2020-07-10 10:03:04.000 | url33 |  3 | 2020-07-10 10:10:00.000 | 2020-07-10 10:50:01.000 
     cookie2  | 2020-07-10 10:10:00.000 | url47 |  4 | 2020-07-10 10:50:01.000 | 2020-07-10 10:50:05.000 
     cookie2  | 2020-07-10 10:50:01.000 | url55 |  5 | 2020-07-10 10:50:05.000 | 2020-07-10 11:00:00.000 
     cookie2  | 2020-07-10 10:50:05.000 | url66 |  6 | 2020-07-10 11:00:00.000 | NULL                    
     cookie2  | 2020-07-10 11:00:00.000 | url77 |  7 | 2020-01-01 00:00:00.000 | NULL                    
     cookie1  | 2020-07-10 10:00:00.000 | url10 |  1 | 2020-07-10 10:00:02.000 | 2020-07-10 10:03:04.000 
     cookie1  | 2020-07-10 10:00:02.000 | url20 |  2 | 2020-07-10 10:03:04.000 | 2020-07-10 10:10:00.000 
     cookie1  | 2020-07-10 10:03:04.000 | urll3 |  3 | 2020-07-10 10:10:00.000 | 2020-07-10 10:50:01.000 
     cookie1  | 2020-07-10 10:10:00.000 | url40 |  4 | 2020-07-10 10:50:01.000 | 2020-07-10 10:50:05.000 
     cookie1  | 2020-07-10 10:50:01.000 | url50 |  5 | 2020-07-10 10:50:05.000 | 2020-07-10 11:00:00.000 
     cookie1  | 2020-07-10 10:50:05.000 | url60 |  6 | 2020-07-10 11:00:00.000 | NULL                    
     cookie1  | 2020-07-10 11:00:00.000 | url70 |  7 | 2020-01-01 00:00:00.000 | NULL                    
    (14 rows)
  • lag(x[, offset[, default_value]])→ [same as input]
    描述:返回窗口分区中当前行之前的偏移行的值,偏移量从0开始,即当前行,偏移量可以是任何标量表达式,默认偏移量为1。如果偏移量为null,则返回null。如果偏移量指向不在分区内的行,则返回default_value。如果未指定,则返回null。lag()函数要求指定窗口顺序,不得指定窗框。
     SELECT cookieid, createtime, url, ROW_NUMBER() OVER(PARTITION BY cookieid  ORDER BY createtime) AS rn, 
       LAG(createtime,1, timestamp '2020-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime)  AS last_1_time, 
       LAG(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime)  AS last_2_time 
        FROM cookie_views;
    
    cookieid |       createtime         |  url  | rn |       last_1_time       |       last_2_time       
    ----------|-------------------------|-------|----|-------------------------|-----------------------
     cookie2  | 2020-07-10 10:00:00.000 | url11 |  1 | 2020-01-01 00:00:00.000 | NULL                    
     cookie2  | 2020-07-10 10:00:02.000 | url23 |  2 | 2020-07-10 10:00:00.000 | NULL                    
     cookie2  | 2020-07-10 10:03:04.000 | url33 |  3 | 2020-07-10 10:00:02.000 | 2020-07-10 10:00:00.000 
     cookie2  | 2020-07-10 10:10:00.000 | url47 |  4 | 2020-07-10 10:03:04.000 | 2020-07-10 10:00:02.000 
     cookie2  | 2020-07-10 10:50:01.000 | url55 |  5 | 2020-07-10 10:10:00.000 | 2020-07-10 10:03:04.000 
     cookie2  | 2020-07-10 10:50:05.000 | url66 |  6 | 2020-07-10 10:50:01.000 | 2020-07-10 10:10:00.000 
     cookie2  | 2020-07-10 11:00:00.000 | url77 |  7 | 2020-07-10 10:50:05.000 | 2020-07-10 10:50:01.000 
     cookie1  | 2020-07-10 10:00:00.000 | url10 |  1 | 2020-01-01 00:00:00.000 | NULL                    
     cookie1  | 2020-07-10 10:00:02.000 | url20 |  2 | 2020-07-10 10:00:00.000 | NULL                    
     cookie1  | 2020-07-10 10:03:04.000 | urll3 |  3 | 2020-07-10 10:00:02.000 | 2020-07-10 10:00:00.000 
     cookie1  | 2020-07-10 10:10:00.000 | url40 |  4 | 2020-07-10 10:03:04.000 | 2020-07-10 10:00:02.000 
     cookie1  | 2020-07-10 10:50:01.000 | url50 |  5 | 2020-07-10 10:10:00.000 | 2020-07-10 10:03:04.000 
     cookie1  | 2020-07-10 10:50:05.000 | url60 |  6 | 2020-07-10 10:50:01.000 | 2020-07-10 10:10:00.000 
     cookie1  | 2020-07-10 11:00:00.000 | url70 |  7 | 2020-07-10 10:50:05.000 | 2020-07-10 10:50:01.000 
    (14 rows)
support.huaweicloud.com/cmpntguide-lts-mrs/mrs_01_2499112.html