sql - 오라클 - 외부 조인




비 equijoin 논리를 사용하지 않고 시계열을 통해 매일 집계 (4)

초기 질문

다음 데이터 세트가 날짜 테이블과 쌍을 이루면 :

MembershipId | ValidFromDate | ValidToDate
==========================================
0001         | 1997-01-01    | 2006-05-09
0002         | 1997-01-01    | 2017-05-12
0003         | 2005-06-02    | 2009-02-07

주어진 일 또는 몇 일간에 얼마나 많은 Memberships 이 열려 있었습니까?

초기 답변

이 질문에 답한 다음이 대답을 통해 필요한 기능을 얻을 수 here .

select d.[Date]
      ,count(m.MembershipID) as MembershipCount
from DIM.[Date] as d
    left join Memberships as m
        on(d.[Date] between m.ValidFromDateKey and m.ValidToDateKey)
where d.CalendarYear = 2016
group by d.[Date]
order by d.[Date];

비평가가 너무 오래 걸릴 때 다른 접근법이 있다는 의견 이 나왔다.

후속 조치

이와 같이, 동등한 논리가 위 쿼리의 출력을 복제하는 것처럼 보이는 것은 무엇입니까?

진행 상황

지금까지 제공된 답변에서 320 만 Membership 기록과 함께 작업하고있는 하드웨어를 능가하는 아래 내용을 제안했습니다 :

declare @s date = '20160101';
declare @e date = getdate();

with s as
(
    select d.[Date] as d
        ,count(s.MembershipID) as s
    from dbo.Dates as d
        join dbo.Memberships as s
            on d.[Date] = s.ValidFromDateKey
    group by d.[Date]
)
,e as
(
    select d.[Date] as d
        ,count(e.MembershipID) as e
    from dbo.Dates as d
        join dbo.Memberships as e
            on d.[Date] = e.ValidToDateKey
    group by d.[Date]
),c as
(
    select isnull(s.d,e.d) as d
            ,sum(isnull(s.s,0) - isnull(e.e,0)) over (order by isnull(s.d,e.d)) as c
    from s
        full join e
            on s.d = e.d
)
select d.[Date]
    ,c.c
from dbo.Dates as d
    left join c
        on d.[Date] = c.d
where d.[Date] between @s and @e
order by d.[Date]
;

그 다음으로,이 집계를 하루에 구성 그룹으로 나누기 위해 나는 다음과 같이 잘 수행하고 있습니다.

declare @s date = '20160101';
declare @e date = getdate();

with s as
(
    select d.[Date] as d
        ,s.MembershipGrouping as g
        ,count(s.MembershipID) as s
    from dbo.Dates as d
        join dbo.Memberships as s
            on d.[Date] = s.ValidFromDateKey
    group by d.[Date]
            ,s.MembershipGrouping
)
,e as
(
    select d.[Date] as d
        ,e..MembershipGrouping as g
        ,count(e.MembershipID) as e
    from dbo.Dates as d
        join dbo.Memberships as e
            on d.[Date] = e.ValidToDateKey
    group by d.[Date]
            ,e.MembershipGrouping
),c as
(
    select isnull(s.d,e.d) as d
            ,isnull(s.g,e.g) as g
            ,sum(isnull(s.s,0) - isnull(e.e,0)) over (partition by isnull(s.g,e.g) order by isnull(s.d,e.d)) as c
    from s
        full join e
            on s.d = e.d
                and s.g = e.g
)
select d.[Date]
    ,c.g
    ,c.c
from dbo.Dates as d
    left join c
        on d.[Date] = c.d
where d.[Date] between @s and @e
order by d.[Date]
        ,c.g
;

누구든지 위의 개선 할 수 있습니까?


날짜 차원에 모든 멤버십 기간에 포함 된 모든 날짜가 포함된다고 가정 할 때 다음과 같은 것을 사용할 수 있습니다.

조인은 equi 조인이므로 중첩 된 루프가 아닌 해시 조인 또는 병합 조인을 사용할 수 있습니다.이 조인 된 루프는 외부 행마다 한 번씩 내부 하위 트리를 실행합니다.

(ValidToDate) include(ValidFromDate) 에 인덱스가 있다고 가정하면 (ValidToDate) include(ValidFromDate) 역순으로 Memberships 에 대한 단일 검색과 날짜 차원에 대한 단일 검색을 사용할 수 있습니다. 아래 나와있는 시간은 320 만 명의 회원과 140 만 명의 일반 회원을 대상으로 1 년 동안 결과를 반환하는 데 1 초 미만의 시간이 걸렸습니다 ( script )

DECLARE @StartDate DATE = '2016-01-01',
        @EndDate   DATE = '2016-12-31';

WITH MD
     AS (SELECT Date,
                SUM(Adj) AS MemberDelta
         FROM   Memberships
                CROSS APPLY (VALUES ( ValidFromDate, +1),
                                    --Membership count decremented day after the ValidToDate
                                    (DATEADD(DAY, 1, ValidToDate), -1) ) V(Date, Adj)
         WHERE
          --Members already expired before the time range of interest can be ignored
          ValidToDate >= @StartDate
          AND
          --Members whose membership starts after the time range of interest can be ignored
          ValidFromDate <= @EndDate
         GROUP  BY Date),
     MC
     AS (SELECT DD.DateKey,
                SUM(MemberDelta) OVER (ORDER BY DD.DateKey ROWS UNBOUNDED PRECEDING) AS CountOfNonIgnoredMembers
         FROM   DIM_DATE DD
                LEFT JOIN MD
                  ON MD.Date = DD.DateKey)
SELECT DateKey,
       CountOfNonIgnoredMembers AS MembershipCount
FROM   MC
WHERE  DateKey BETWEEN @StartDate AND @EndDate 
ORDER BY DateKey

Demo (2016 년의 달력 연도가 예제 데이터에서는 그리 흥미롭지 않습니다.)


다음은 equijoin을 사용하여이 문제를 해결하는 방법입니다.

--data generation
declare @Membership table (MembershipId varchar(10), ValidFromDate date, ValidToDate date)
insert into @Membership values
('0001', '1997-01-01', '2006-05-09'),
('0002', '1997-01-01', '2017-05-12'),
('0003', '2005-06-02', '2009-02-07')

declare @startDate date, @endDate date
select @startDate =  MIN(ValidFromDate), @endDate = max(ValidToDate) from @Membership
--in order to use equijoin I need all days between min date and max date from Membership table (both columns)
;with cte as (
    select @startDate [date]
    union all
    select DATEADD(day, 1, [date]) from cte
    where [date] < @endDate
)
--in this query, we will assign value to each day:
--one, if project started on that day
--minus one, if project ended on that day
--then, it's enough to (cumulative) sum all this values to get how many projects were ongoing on particular day
select [date],
       sum(case when [DATE] = ValidFromDate then 1 else 0 end +
            case when [DATE] = ValidToDate then -1 else 0 end)
            over (order by [date] rows between unbounded preceding and current row)
from cte [c]
left join @Membership [m]
on [c].[date] = [m].ValidFromDate  or [c].[date] = [m].ValidToDate
option (maxrecursion 0)

다른 해결책이 있습니다.

--data generation
declare @Membership table (MembershipId varchar(10), ValidFromDate date, ValidToDate date)
insert into @Membership values
('0001', '1997-01-01', '2006-05-09'),
('0002', '1997-01-01', '2017-05-12'),
('0003', '2005-06-02', '2009-02-07')

;with cte as (
    select CAST('2016-01-01' as date) [date]
    union all
    select DATEADD(day, 1, [date]) from cte
    where [date] < '2016-12-31'
)

select [date],
       (select COUNT(*) from @Membership where ValidFromDate < [date]) - 
       (select COUNT(*) from @Membership where ValidToDate < [date]) [ongoing]
from cte
option (maxrecursion 0)

주의를 기울이십시오, 나는 현재 쿼리가 잘못된 결과를 반환한다고 말하면 @PittsburghDBA가 옳다고 생각합니다.
회원 자격의 마지막 날은 계산되지 않으므로 최종 금액은 예상보다 낮습니다.
이 버전에서 수정했습니다.

이렇게하면 실제 진행 상황이 약간 개선됩니다.

declare @s date = '20160101';
declare @e date = getdate();

with 
x as (
    select d, sum(c) c
    from (
        select ValidFromDateKey d, count(MembershipID) c
        from Memberships
        group by ValidFromDateKey 

        union all

        -- dateadd needed to count last day of membership too!!
        select dateadd(dd, 1, ValidToDateKey) d, -count(MembershipID) c
        from Memberships
        group by ValidToDateKey 
    )x
    group by d
),
c as
(
    select d, sum(x.c) over (order by d) as c
    from x
)
select d.day, c cnt
from calendar d
left join c on d.day = c.d
where d.day between @s and @e
order by d.day;

한 가지 방법은 INNER JOIN을 사용하여 일치 항목을 찾고 COUNT ()를 사용하여 MemberCount GROUPed BY DateKey를 작성한 다음 UNION ALL을 동일한 날짜 세트로 사용하고 각 날짜의 멤버 수를 0으로 계산합니다 . 마지막 단계는이 공용체의 MemberCount와 GROUP BY DateKey를 SUM ()하는 것입니다. 요청에 따라 LEFT JOIN과 NOT EXISTS를 피할 수 있습니다. 다른 멤버가 지적했듯이, 이것은 등속 조인이 아닙니다. 왜냐하면 우리는 범위를 사용해야하기 때문에,하지만 당신이 의도 한대로 행동한다고 ​​생각합니다.

이것은 약 1 만년의 논리적 인 읽기와 함께 약 1 년 분량의 자료를 제공 할 것입니다. 콜드 캐시 (cold cache)에서 돌아가는 디스크가있는 일반 노트북에서는 1 개월에서 1 개월 (정확한 카운트 수)를 제공합니다.

다음은 무작위 기간 330 만 개의 행을 만드는 예제입니다. 하단의 쿼리는 1 개월 분량의 데이터를 반환합니다.

--Stay quiet for a moment
SET NOCOUNT ON
SET STATISTICS IO OFF
SET STATISTICS TIME OFF

--Clean up if re-running
DROP TABLE IF EXISTS DIM_DATE
DROP TABLE IF EXISTS FACT_MEMBER

--Date dimension
CREATE TABLE DIM_DATE
  (
  DateKey DATE NOT NULL 
  )

--Membership fact
CREATE TABLE FACT_MEMBER
  (
  MembershipId INT NOT NULL
  , ValidFromDateKey DATE NOT NULL
  , ValidToDateKey DATE NOT NULL
  )

--Populate Date dimension from 2001 through end of 2018
DECLARE @startDate DATE = '2001-01-01'
DECLARE @endDate DATE = '2018-12-31'
;WITH CTE_DATE AS
(
SELECT @startDate AS DateKey
UNION ALL
SELECT
       DATEADD(DAY, 1, DateKey)
FROM
       CTE_DATE AS D
WHERE
       D.DateKey < @endDate
)
INSERT INTO
  DIM_DATE
  (
  DateKey
  )
SELECT
  D.DateKey
FROM
  CTE_DATE AS D
OPTION (MAXRECURSION 32767)

--Populate Membership fact with members having a random membership length from 1 to 36 months 
;WITH CTE_DATE AS
(
SELECT @startDate AS DateKey
UNION ALL
SELECT
       DATEADD(DAY, 1, DateKey)
FROM
       CTE_DATE AS D
WHERE
       D.DateKey < @endDate
)
,CTE_MEMBER AS
(
SELECT 1 AS MembershipId
UNION ALL
SELECT MembershipId + 1 FROM CTE_MEMBER WHERE MembershipId < 500
)
,
CTE_MEMBERSHIP
AS
(
SELECT
  ROW_NUMBER() OVER (ORDER BY NEWID()) AS MembershipId
  , D.DateKey AS ValidFromDateKey
FROM
  CTE_DATE AS D
  CROSS JOIN CTE_MEMBER AS M
)
INSERT INTO
    FACT_MEMBER
    (
    MembershipId
    , ValidFromDateKey
    , ValidToDateKey
    )
SELECT
    M.MembershipId
    , M.ValidFromDateKey
      , DATEADD(MONTH, FLOOR(RAND(CHECKSUM(NEWID())) * (36-1)+1), M.ValidFromDateKey) AS ValidToDateKey
FROM
    CTE_MEMBERSHIP AS M
OPTION (MAXRECURSION 32767)

--Add clustered Primary Key to Date dimension
ALTER TABLE DIM_DATE ADD CONSTRAINT PK_DATE PRIMARY KEY CLUSTERED
    (
    DateKey ASC
    )

--Index
--(Optimize in your spare time)
DROP INDEX IF EXISTS SK_FACT_MEMBER ON FACT_MEMBER
CREATE CLUSTERED INDEX SK_FACT_MEMBER ON FACT_MEMBER
    (
    ValidFromDateKey ASC
    , ValidToDateKey ASC
    , MembershipId ASC
    )


RETURN

--Start test
--Emit stats
SET STATISTICS IO ON
SET STATISTICS TIME ON

--Establish range of dates
DECLARE
  @rangeStartDate DATE = '2010-01-01'
  , @rangeEndDate DATE = '2010-01-31'

--UNION the count of members for a specific date range with the "zero" set for the same range, and SUM() the counts
;WITH CTE_MEMBER
AS
(
SELECT
    D.DateKey
    , COUNT(*) AS MembershipCount
FROM
    DIM_DATE AS D
    INNER JOIN FACT_MEMBER AS M ON
        M.ValidFromDateKey <= @rangeEndDate
        AND M.ValidToDateKey >= @rangeStartDate
        AND D.DateKey BETWEEN M.ValidFromDateKey AND M.ValidToDateKey
WHERE
    D.DateKey BETWEEN @rangeStartDate AND @rangeEndDate
GROUP BY
    D.DateKey

UNION ALL

SELECT
    D.DateKey
    , 0 AS MembershipCount
FROM
    DIM_DATE AS D
WHERE
    D.DateKey BETWEEN @rangeStartDate AND @rangeEndDate
)
SELECT
    M.DateKey
    , SUM(M.MembershipCount) AS MembershipCount
FROM
    CTE_MEMBER AS M
GROUP BY
    M.DateKey
ORDER BY
    M.DateKey ASC
OPTION (RECOMPILE, MAXDOP 1)






join