Hello :) Today is Day 328!
A quick summary of today:
- learned a tonne about fact data modelling
Week 2 - Fact data modelling
Today Zach Wilson released the week 2 content in one 4-hour video.
There are 3 lectures and 3 labs in total. I watched the lecture versions on the dataexpert.io platform, which are the updated v3 versions, so unfortunately I cannot share my notes from them. To have something to share here, I followed along with the 3 labs.
The videos were all amazing. Here is my code from the 3 labs, with some small comments.
Lab 1 - making a fact table
-- fact data modelling lab 1
-- check if game_id, team_id, player_id are unique
-- we find there are (almost) 2 copies of every record
select
game_id, team_id, player_id, count(1)
from game_details
group by game_id, team_id, player_id
having count(1) > 1;
-- Things to remember when doing fact data modelling
-- are the cols we are bringing in even useful?
-- don't put derived values in the fact table; let the analyst do w/e they want
-- when logging we might get double data
-- so it's common to dedupe as the 1st step
insert into fct_game_details
with deduped as (
select
g.game_date_est,
g.season,
g.home_team_id,
gd.*,
row_number() over (partition by gd.game_id, team_id, player_id order by g.game_date_est) as row_num
from game_details gd join games g on gd.game_id = g.game_id
)
select
-- all these cols are part of the fundamental nature of the fact
-- we might want to put all the IDs first, then the dims, then the measures
game_date_est as dim_game_date,
season as dim_season,
team_id as dim_team_id,
player_id as dim_player_id,
player_name as dim_player_name,
start_position as dim_start_position,
team_id = home_team_id as dim_is_playing_at_home,
coalesce(position('DNP' in comment), 0) > 0 as dim_did_not_play,
coalesce(position('DND' in comment), 0) > 0 as dim_did_not_dress,
coalesce(position('NWT' in comment), 0) > 0 as dim_not_with_team,
-- the raw comment col itself can be dropped once we are confident we have parsed all its options
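-- e.g. a min value of '34:21' becomes 34 + 21/60 = 34.35 minutes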
cast(split_part(min, ':', 1) as real)
+ cast(split_part(min, ':', 2) as real)/60 as m_minutes,
fgm as m_fgm,
fga as m_fga,
fg3m as m_fg3m,
fg3a as m_fg3a,
ftm as m_ftm,
fta as m_fta,
oreb as m_oreb,
dreb as m_dreb,
reb as m_reb,
ast as m_ast,
stl as m_stl,
blk as m_blk,
"TO" as m_turnovers,
pf as m_pf,
pts as m_pts,
plus_minus as m_plus_minus
from deduped
where row_num = 1;
-- note: this create table needs to run before the insert above
create table fct_game_details (
-- dim_ are the cols that you should group by and filter on
dim_game_date date,
dim_season integer,
dim_team_id integer,
dim_player_id integer,
dim_player_name text,
dim_start_position text,
dim_is_playing_at_home boolean,
dim_did_not_play boolean,
dim_did_not_dress boolean,
dim_not_with_team boolean,
-- measures -> m_
-- m_ are the cols that you should aggregate and do math on
m_minutes real,
m_fgm integer,
m_fga integer,
m_fg3m integer,
m_fg3a integer,
m_ftm integer,
m_fta integer,
m_oreb integer,
m_dreb integer,
m_reb integer,
m_ast integer,
m_stl integer,
m_blk integer,
m_turnovers integer,
m_pf integer,
m_pts integer,
m_plus_minus integer,
-- the PK creates an index, which is good when we filter on cols in the PK
primary key (dim_game_date, dim_team_id, dim_player_id)
);
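-- (my addition, not from the lab) a quick way to see the PK index at work:
-- explain should show an index scan when we filter on cols in the PK
-- (the date and team_id values here are made up for illustration)
explain
select *
from fct_game_details
where dim_game_date = date('2022-12-22')
and dim_team_id = 1610612739;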
-- we lost the team info (only team_id made it into the fact table), but we can easily join it back in
select t.*, gd.*
from fct_game_details gd join teams t
on t.team_id = gd.dim_team_id;
-- let's find the players that bailed on the most games
-- fact data modelling is about making tables that are easy to query to build cool things
select dim_player_name,
count(1) as num_games,
count(case when dim_not_with_team then 1 end) as bailed_num,
cast(count(case when dim_not_with_team then 1 end) as real)/count(1) as bail_pct
from fct_game_details
group by 1
order by 4 desc;
Lab 2 - Learning about DateList 🤯
-- week 2 lab 2
create table users_cumulated (
user_id text,
-- list of dates in the past when the user was active
dates_active date[],
-- current date for the user
date date,
primary key (user_id, date)
);
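-- e.g. a row after a few days might look like this (made-up values):
-- ('8991572...', {2023-01-31,2023-01-29,2023-01-28}, 2023-01-31)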
insert into users_cumulated
with yesterday as (
select *
from users_cumulated
where date = date('2023-01-30')
), today as (
select
cast(user_id as text) as user_id,
date(cast(event_time as timestamp)) as date_active
from events
where
date(cast(event_time as timestamp)) = date('2023-01-31')
and user_id is not null -- deal with null user_ids in this data
group by user_id, date(cast(event_time as timestamp))
)
select
coalesce(t.user_id, y.user_id) as user_id,
case
when y.dates_active is null then array[t.date_active]
when t.date_active is null then y.dates_active -- we don't want to keep adding a big array of nulls
else array[t.date_active] || y.dates_active
end as dates_active,
-- today's date_active is null if the user had no event today,
-- so we fall back to yesterday's date + 1 day
coalesce(t.date_active, y.date + interval '1 day') as date
from today t full outer join yesterday y
on t.user_id = y.user_id;
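-- (sketch, my own addition) the insert above only moves the table one day forward,
-- so backfilling January means re-running it once per day;
-- a plpgsql loop like this could drive it (the actual insert is omitted here):
do $$
declare d date;
begin
for d in select date(gs) from generate_series(date('2023-01-01'), date('2023-01-31'), interval '1 day') as gs
loop
-- re-run the cumulative insert with yesterday = d - 1 and today = d
raise notice 'backfilling %', d;
end loop;
end $$;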
-- generate a datelist for the 31 days of January
with users as (
select * from users_cumulated
where date = date('2023-01-31')
),
series as (
select * from generate_series(date('2023-01-01'), date('2023-01-31'), interval '1 day') as series_date
),
place_holder_ints as (
select
case
when dates_active @> array[date(series_date)]
-- date - date(series_date) is the # of days between the current date and the series date
-- if we cast a power of 2 as a bigint and then to bit(32),
-- we get a history of 1s and 0s for active/inactive days
-- (31 - diff puts today in the leftmost bit; with 32, today's 2^32 would overflow bit(32))
then cast(pow(2, 31 - (date - date(series_date))) as bigint)
else 0
end as placeholder_int_value,
*
from users cross join series -- the cross join gives us all 31 days for each user
)
select
user_id,
-- these are extremely efficient operations
-- bit_count() tells us on how many days the user was active
bit_count(cast(cast(sum(placeholder_int_value) as bigint) as bit(32))) > 0 as dim_is_monthly_active,
-- let's check if the user was active in the last 7 days
bit_count(cast('11111110000000000000000000000000' as bit(32)) & -- bitwise AND
cast(cast(sum(placeholder_int_value) as bigint) as bit(32))) > 0 as dim_is_weekly_active,
-- daily is the same but with only the 1st bit set
bit_count(cast('10000000000000000000000000000000' as bit(32)) & -- bitwise AND
cast(cast(sum(placeholder_int_value) as bigint) as bit(32))) > 0 as dim_is_daily_active
from place_holder_ints
group by user_id;
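-- (my addition) the bit trick in isolation: a user active on the 1st and 3rd
-- most recent days has those bits set, and bit_count simply counts the 1s
-- (bit_count on bit strings needs postgres 14+)
select bit_count(cast('10100000000000000000000000000000' as bit(32))) as days_active; -- returns 2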
Lab 3 - Reducing fact data
-- week 2 lab 3
create table array_metrics (
user_id numeric,
month_start date,
metric_name text,
metric_array real[],
primary key (user_id, month_start, metric_name)
);
insert into array_metrics
with daily_aggregate as (
select
user_id,
date(cast(event_time as timestamp)) as date,
count(1) as num_site_hits
from events
where date(cast(event_time as timestamp)) = date('2023-01-31')
and user_id is not null
group by user_id, date(cast(event_time as timestamp))
),
yesterday_array as (
select *
from array_metrics
where month_start = date('2023-01-01')
)
select
coalesce(da.user_id, ya.user_id) as user_id,
coalesce(ya.month_start, date_trunc('month', da.date)) as month_start,
'site_hits' as metric_name,
case
-- the case branches are in the reverse order of lab 2
-- we coalesce to 0 because we don't want nulls inside the metric array
when ya.metric_array is not null then ya.metric_array || array[coalesce(da.num_site_hits, 0)]
-- if a new user shows up after the 1st of the month, pad the start of the array with 0s
-- before appending today's value; on the 1st of the month there is nothing to pad,
-- which is what the coalesce to 0 handles
when ya.metric_array is null then array_fill(0, array[coalesce(date - date(date_trunc('month', date)), 0)]) || array[coalesce(da.num_site_hits, 0)]
end as metric_array
from daily_aggregate da full outer join yesterday_array ya
on da.user_id = ya.user_id
on conflict (user_id, month_start, metric_name)
do
update set metric_array = excluded.metric_array;
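-- (my addition) sanity check: every array for a given month should have the same
-- length, otherwise a day was skipped or loaded twice
select cardinality(metric_array), count(1)
from array_metrics
group by 1;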
-- we can now do N-day analysis,
-- going from the monthly array metrics back to daily aggregates
-- it's very fast because the arrays are the minimal set of data we need
with agg as (
select metric_name,
month_start,
array [
sum(metric_array[1]),
sum(metric_array[2]),
sum(metric_array[3]),
sum(metric_array[4])
] as summed_array
from array_metrics
group by metric_name, month_start
)
select metric_name,
month_start + cast(cast(index - 1 as text) || ' day' as interval) as date,
elem as value
from agg
cross join unnest(agg.summed_array)
with ordinality as a(elem, index);
That is all for today!
See you tomorrow :)