diff --git a/README.md b/README.md
index 3c95b08030d..dcf86774c2a 100644
--- a/README.md
+++ b/README.md
@@ -2,3 +2,39 @@ BigQuery ETL
 ===
 
 Bigquery UDFs and SQL queries for building derived datasets.
+
+Recommended practices
+===
+
+- Should name sql files like `sql/destination_table_with_version.sql` e.g.
+  `sql/clients_daily_v6.sql`
+- Should not specify a project or dataset in table names to simplify testing
+- Should use incremental queries
+- Should filter input tables on partition and clustering columns
+- Should use UDF language `SQL` over `js` for performance
+- Should use UDFs for reusability
+- Should use query parameters over jinja templating
+  - Temporary issue: Airflow 1.10+ is required in order to use query parameters
+
+Incremental Queries
+===
+
+Incremental queries have these benefits:
+
+- BigQuery billing discounts for destination table partitions not modified in
+  the last 90 days
+- Requires less airflow configuration
+- Will have tooling to automate backfilling
+- Will have tooling to replace partitions atomically to prevent duplicate data
+- Will have tooling to generate an optimized "destination plus" view that
+  calculates the most recent partition
+
+Incremental queries have these properties:
+
+- Must accept a date via `@submission_date` query parameter
+  - Must output a column named `submission_date` matching the query parameter
+- Must produce similar results when run multiple times
+  - Should produce identical results when run multiple times
+- May depend on the previous partition
+  - If using previous partition, must include a `.init.sql` query to init the
+    first partition
diff --git a/sql/clients_last_seen_v1.init.sql b/sql/clients_last_seen_v1.init.sql
new file mode 100644
index 00000000000..2925843ecb0
--- /dev/null
+++ b/sql/clients_last_seen_v1.init.sql
@@ -0,0 +1,20 @@
+-- Initialize the first partition of clients_last_seen_v1: for each client_id,
+-- take the most recent clients_daily_v6 row from the 28 days ending on
+-- @submission_date.
+SELECT
+  @submission_date AS submission_date,
+  CURRENT_DATETIME() AS generated_time,
+  MAX(submission_date_s3) AS last_seen_date,
+  -- approximate LAST_VALUE(input).*
+  ARRAY_AGG(input
+    ORDER BY submission_date_s3
+    DESC LIMIT 1
+  )[OFFSET(0)].* EXCEPT (submission_date_s3)
+FROM
+  clients_daily_v6 AS input
+WHERE
+  submission_date_s3 <= @submission_date
+  AND
+  submission_date_s3 > DATE_SUB(@submission_date, INTERVAL 28 DAY)
+GROUP BY
+  input.client_id
diff --git a/sql/clients_last_seen_v1.sql b/sql/clients_last_seen_v1.sql
new file mode 100644
index 00000000000..1f60abda3c4
--- /dev/null
+++ b/sql/clients_last_seen_v1.sql
@@ -0,0 +1,35 @@
+-- Incremental build of clients_last_seen_v1 for @submission_date: clients with
+-- a clients_daily_v6 row on that date use it; all others are carried forward
+-- from the previous day's partition while still inside the 28-day window.
+WITH current_sample AS (
+  SELECT
+    submission_date_s3 AS last_seen_date,
+    * EXCEPT (submission_date_s3)
+  FROM
+    clients_daily_v6
+  WHERE
+    submission_date_s3 = @submission_date
+), previous AS (
+  SELECT
+    * EXCEPT (submission_date,
+      generated_time)
+  FROM
+    -- unqualified (no project or dataset) per the README, to simplify testing
+    clients_last_seen_v1
+  WHERE
+    submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
+    AND last_seen_date > DATE_SUB(@submission_date, INTERVAL 28 DAY)
+)
+SELECT
+  @submission_date AS submission_date,
+  CURRENT_DATETIME() AS generated_time,
+  -- prefer the row for @submission_date; otherwise keep the carried-forward row
+  IF(current_sample.client_id IS NOT NULL,
+    current_sample,
+    previous).*
+FROM
+  current_sample
+FULL JOIN
+  previous
+USING
+  (client_id)
diff --git a/sql/firefox_desktop_exact_mau28_by_dimensions_v1.sql b/sql/firefox_desktop_exact_mau28_by_dimensions_v1.sql
new file mode 100644
index 00000000000..027513f1750
--- /dev/null
+++ b/sql/firefox_desktop_exact_mau28_by_dimensions_v1.sql
@@ -0,0 +1,26 @@
+-- MAU and DAU per dimension combination for @submission_date, counted from
+-- that day's clients_last_seen_v1 partition (DAU = last seen on that date).
+SELECT
+  submission_date,
+  CURRENT_DATETIME() AS generated_time,
+  COUNT(*) AS mau,
+  COUNTIF(last_seen_date = submission_date) AS dau,
+  -- requested fields from bug 1525689
+  source,
+  medium,
+  campaign,
+  content,
+  country,
+  distribution_id
+FROM
+  clients_last_seen_v1
+WHERE
+  submission_date = @submission_date
+GROUP BY
+  submission_date,
+  source,
+  medium,
+  campaign,
+  content,
+  country,
+  distribution_id
diff --git a/sql/firefox_desktop_exact_mau28.sql b/sql/firefox_desktop_exact_mau28_v1.sql
similarity index 50%
rename from sql/firefox_desktop_exact_mau28.sql
rename to sql/firefox_desktop_exact_mau28_v1.sql
index 886796a0bc6..24a4dd630ec 100644
--- a/sql/firefox_desktop_exact_mau28.sql
+++ b/sql/firefox_desktop_exact_mau28_v1.sql
@@ -2,9 +2,9 @@ SELECT
   @submission_date AS submission_date,
   CURRENT_DATETIME() AS generated_time,
   COUNT(DISTINCT client_id) AS mau,
-  SUM(CAST(submission_date_s3 = @submission_date AS INT64)) as dau
+  COUNTIF(submission_date_s3 = @submission_date) AS dau
 FROM
-  telemetry.clients_daily_v6
+  clients_daily_v6
 WHERE
   submission_date_s3 <= @submission_date
-  AND submission_date_s3 > DATE_ADD(@submission_date, INTERVAL -28 DAY)
+  AND submission_date_s3 > DATE_SUB(@submission_date, INTERVAL 28 DAY)