(Day 328) Week 2 - Fact Data Modeling

Ivan Ivanov · November 24, 2024

Hello :) Today is Day 328!

A quick summary of today:

  • learned a tonne about fact data modelling 😆

Week 2 - Fact Data Modelling

Today, Zach Wilson released the week 2 content in one 4-hour video.

There are 3 lectures and 3 labs in total. I watched the lecture versions on the dataexpert.io platform, which are the updated v3 versions, so unfortunately I cannot share my notes from them. To share something here, I followed along with the 3 labs instead.

The videos were all amazing. Here is my code from the 3 labs, with some small comments.

Lab 1 - Making a fact table

-- fact data modelling lab 1

-- check if game_id, team_id, player_id are unique
-- we find there are (almost) 2 copies of every record
select
    game_id, team_id, player_id, count(1)
from game_details
group by game_id, team_id, player_id
having count(1) > 1;

-- Things to remember when doing fact data modelling:
-- are the cols we are giving even useful?
-- don't put any derived values in the fact table, let the analysts do whatever they want
-- when logging we might get double data
-- so it's common to dedupe as the 1st step
insert into fct_game_details
with deduped as (
    select
        g.game_date_est,
        g.season,
        g.home_team_id,
        gd.*,
        row_number() over (partition by gd.game_id, team_id, player_id order by g.game_date_est) as row_num
    from game_details gd join games g on gd.game_id = g.game_id
)
select
    -- all these cols are a fundamental nature of the fact
    -- we might want to put all the IDs first, then the dims
    game_date_est as dim_game_date,
    season as dim_season,
    team_id as dim_team_id,
    player_id as dim_player_id,
    player_name as dim_player_name,
    start_position as dim_start_position,
    team_id = home_team_id as dim_is_playing_at_home,
    coalesce(position('DNP' in comment), 0) > 0 as dim_did_not_play,
    coalesce(position('DND' in comment), 0) > 0 as dim_did_not_dress,
    coalesce(position('NWT' in comment), 0) > 0 as dim_not_with_team,
--     comment -- we can remove it if we are confident that we have parsed all options
    -- the raw min column is an 'MM:SS' string, so convert it to fractional minutes
    cast(split_part(min, ':', 1) as real)
        + cast(split_part(min, ':', 2) as real)/60 as m_minutes,
    fgm as m_fgm,
    fga as m_fga,
    fg3m as m_fg3m,
    fg3a as m_fg3a,
    ftm as m_ftm,
    fta as m_fta,
    oreb as m_oreb,
    dreb as m_dreb,
    reb as m_reb,
    ast as m_ast,
    stl as m_stl,
    blk as m_blk,
    "TO" as m_turnovers,
    pf as m_pf,
    pts as m_pts,
    plus_minus as m_plus_minus
from deduped
where row_num = 1;
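
As a sanity check (my own addition, not part of the lab), re-running the duplicate query against the new table should now return zero rows:

-- should return no rows if the dedupe worked
select dim_game_date, dim_team_id, dim_player_id, count(1)
from fct_game_details
group by dim_game_date, dim_team_id, dim_player_id
having count(1) > 1;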

-- for reference, the DDL (I created the table before running the insert above)
create table fct_game_details (
    -- dim_ are cols that you should group by and filter on
    dim_game_date date,
    dim_season integer,
    dim_team_id integer,
    dim_player_id integer,
    dim_player_name text,
    dim_start_position text,
    dim_is_playing_at_home boolean,
    dim_did_not_play boolean,
    dim_did_not_dress boolean,
    dim_not_with_team boolean,
    -- measures -> m_
    -- m_ are cols that you should aggregate and do math on
    m_minutes real,
    m_fgm integer,
    m_fga integer,
    m_fg3m integer,
    m_fg3a integer,
    m_ftm integer,
    m_fta integer,
    m_oreb integer,
    m_dreb integer,
    m_reb integer,
    m_ast integer,
    m_stl integer,
    m_blk integer,
    m_turnovers integer,
    m_pf integer,
    m_pts integer,
    m_plus_minus integer,
    -- the PK helps create indexes which is good if we do filtering on cols in the PK
    primary key (dim_game_date, dim_team_id, dim_player_id)
);
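
Since the PK gives us an index, a quick way to confirm Postgres actually uses it when filtering on the leading PK columns (again my own check, and the team_id value is just an example):

-- my check: does filtering on the PK's leading columns hit the index?
explain
select *
from fct_game_details
where dim_game_date = date('2023-01-01')
  and dim_team_id = 1610612737; -- example team_id, any real one works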

-- we may have lost the team info but we can easily bring it back in
select t.*, gd.*
from fct_game_details gd join teams t
on t.team_id = gd.dim_team_id;

-- lets find the players that bailed out on the most games
-- fact data modelling is about making tables that are easy to query to create cool things
select dim_player_name,
       count(1) as num_games,
       count(case when dim_not_with_team then 1 end) as bailed_num,
       cast(count(case when dim_not_with_team then 1 end) as real)/count(1) as bail_pct
from fct_game_details
group by 1
order by 4 desc;

Lab 2 - Learning about DateList 🤯

-- week 2 lab 2

create table users_cumulated (
    user_id text,
    -- list of dates in the past when the user was active
    dates_active date[],
    -- current date for the user
    date date,
    primary key (user_id, date)
);

insert into users_cumulated
with yesterday as (
    select *
    from users_cumulated
    where date = date('2023-01-30')
), today as (
    select
        cast(user_id as text) as user_id,
        date(cast(event_time as timestamp)) as date_active
    from events
    where
        date(cast(event_time as timestamp)) = date('2023-01-31')
        and user_id is not null -- deal with null user_ids in this data
    group by user_id, date(cast(event_time as timestamp))
)
select
    coalesce(t.user_id, y.user_id) as user_id,
    case
        when y.dates_active is null then array[t.date_active]
        when t.date_active is null then y.dates_active -- we don't want to keep adding a big array of nulls
        else array[t.date_active] || y.dates_active
    end as dates_active,
    -- today's date_active is null if the user wasn't active today,
    -- so we add 1 day to yesterday's date instead
    coalesce(t.date_active, date(y.date + interval '1 day')) as date
from today t full outer join yesterday y
on t.user_id = y.user_id;
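
The lab hardcodes the two dates, so backfilling a whole month means re-running the insert 31 times by hand. Here is a minimal sketch of how I could wrap it in a PL/pgSQL DO block instead (my own sketch, not from the lab; same tables and logic as above):

-- backfill the whole month in one go with a loop
do $$
declare
    d date;
begin
    for d in
        select date(series_ts)
        from generate_series(date('2023-01-01'), date('2023-01-31'), interval '1 day') as series_ts
    loop
        insert into users_cumulated
        with yesterday as (
            select * from users_cumulated where date = d - 1
        ), today as (
            select
                cast(user_id as text) as user_id,
                date(cast(event_time as timestamp)) as date_active
            from events
            where date(cast(event_time as timestamp)) = d
              and user_id is not null
            group by user_id, date(cast(event_time as timestamp))
        )
        select
            coalesce(t.user_id, y.user_id),
            case
                when y.dates_active is null then array[t.date_active]
                when t.date_active is null then y.dates_active
                else array[t.date_active] || y.dates_active
            end,
            coalesce(t.date_active, date(y.date + interval '1 day'))
        from today t full outer join yesterday y on t.user_id = y.user_id;
    end loop;
end $$;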


-- generate a datelist for 30 days
with users as (
    select * from users_cumulated
    where date = date('2023-01-31')
),
    series as (
        select * from generate_series(date('2023-01-01'), date('2023-01-31'), interval '1 day') as series_date
    ),
    place_holder_ints as (
        select
            case
                when dates_active @> array[date(series_date)]
                    -- date - series_date is the # of days between the current date and the series date
                    -- if we cast a power-of-2 number to a bigint and then to binary,
                    -- we get a history of 1s and 0s for active/inactive days
                    then cast(pow(2, 32 - (date - date(series_date))) as bigint)
                else 0
            end as placeholder_int_value,
            *
        from users cross join series -- we got the 31 days for each user
    )
select
    user_id,
    -- these are extremely efficient operations
    -- bit_count() can give us how many times the user is active
    bit_count(cast(cast(sum(placeholder_int_value) as bigint) as bit(32))) > 0 as dim_is_monthly_active,
    -- let's check if a user was active in the last 7 days
    bit_count(cast('11111110000000000000000000000000' as bit(32)) & -- bit-wise AND
        cast(cast(sum(placeholder_int_value) as bigint) as bit(32))) > 0 as dim_is_weekly_active,
    -- daily is the same, but with only the 1st bit set
    bit_count(cast('10000000000000000000000000000000' as bit(32)) & -- bit-wise AND
        cast(cast(sum(placeholder_int_value) as bigint) as bit(32))) > 0 as dim_is_daily_active
from place_holder_ints
group by user_id;
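
To convince myself the bit trick works end to end, here is a tiny standalone example (my own, not from the lab): 3 active days become 3 set bits, and bit_count plus a bit-wise AND recover the monthly and weekly counts.

-- 3 "active day" placeholder ints: bit positions 1, 2 and 9 from the left
with history as (
    select cast(
        (cast(pow(2, 32 - 1) as bigint)
         + cast(pow(2, 32 - 2) as bigint)
         + cast(pow(2, 32 - 9) as bigint)) as bit(32)) as bits
)
select
    bits,                           -- 11000000100000000000000000000000
    bit_count(bits) as days_active, -- 3
    bit_count(cast('11111110000000000000000000000000' as bit(32)) & bits) as active_in_last_7 -- 2
from history;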

Lab 3 - Reducing fact data

-- week 2 lab 3

create table array_metrics (
    user_id numeric,
    month_start date,
    metric_name text,
    metric_array real[],
    primary key (user_id, month_start, metric_name)
);

insert into array_metrics
with daily_aggregate as (
    select
        user_id,
        date(event_time) as date,
        count(1) as num_site_hits
    from events
    where date(event_time) = date('2023-01-31')
    and user_id is not null
    group by user_id, date(event_time)
),
    yesterday_array as (
        select *
        from array_metrics
        where month_start = date('2023-01-01')
    )

select
    coalesce(da.user_id, ya.user_id) as user_id,
    coalesce(ya.month_start, date_trunc('month', da.date)) as month_start,
    'site_hits' as metric_name,
    case
        -- the case branches are in the reverse order compared to lab 2
        -- we coalesce with 0 because we are not fine with having nulls in the array
        when ya.metric_array is not null then ya.metric_array || array[coalesce(da.num_site_hits, 0)]
        -- deal with a new user who shows up after the 1st of the month:
        -- pad the start of the array with 0s, then append today's value
        -- if the date diff is null, it's the 1st day of the month and there is nothing to pad
        when ya.metric_array is null then array_fill(0, array[coalesce(date - date(date_trunc('month', date)), 0)]) || array[coalesce(da.num_site_hits, 0)]
    end as metric_array
from daily_aggregate da full outer join yesterday_array ya
on da.user_id = ya.user_id
on conflict (user_id, month_start, metric_name)
do
    update set metric_array = excluded.metric_array;
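
A quick check I added myself: after backfilling all of January with the insert above, every row's array should hold 31 entries.

-- every user loaded for the full month should show 31 days
select cardinality(metric_array) as days_loaded, count(1) as num_users
from array_metrics
group by 1;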


-- we can do N-day analysis by going from the monthly array metrics
-- back to daily aggregates
-- it's very fast because we scan only the minimal set of data we need
with agg as (
    select metric_name,
        month_start,
        array [
            sum(metric_array[1]),
            sum(metric_array[2]),
            sum(metric_array[3]),
            sum(metric_array[4])
        ] as summed_array
     from array_metrics
     group by metric_name, month_start
)
select metric_name,
       month_start + cast(cast(index - 1 as text) || ' day' as interval) as date,
       elem as value
from agg
    cross join unnest(agg.summed_array)
        with ordinality as a(elem, index);
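
For reference, here is unnest ... with ordinality on its own (a toy example I added), which shows where elem and index come from:

-- with ordinality numbers the unnested elements starting from 1
select elem, index
from unnest(array[10, 20, 30]) with ordinality as a(elem, index);
-- elem | index
--   10 |     1
--   20 |     2
--   30 |     3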

That is all for today!

See you tomorrow :)