Data Engineering SQL Holiday Specials#
Worked examples of new features released around the 2024 holiday season, accompanying the blog post here
To try and spice up what is essentially a list of features, I've categorised them by release stage (GA vs Public Preview) as well as the Holly Smith scale of feature usefulness:
🐳 Extra Large - 🐘 Large - 🦩 Medium - 🐦 Small#
Requirements#
To run these you will need: a single user cluster running DBR 16.1.
To test out every feature in full you will optionally need:
- A serverless warehouse to run statements in the SQL editor [materialised views]
- The ability to create DLT pipelines [materialised views]
- Your UC metastore ID and an admin's personal access token to enable system tables [warehouse system tables]
🐘 New SQL functions#
-- takes an encoded URL and attempts to decode it
-- encoded URLs are where reserved or non-ASCII characters (+, ?, /, #, & etc) are replaced with a % followed by a hexadecimal number (ie where letters A-F are used as digits after 0-9)
-- DBR 16.0, https://docs.databricks.com/en/sql/language-manual/functions/try_url_decode.html
-- this will work as it's a valid URL, and will return the decoded URL
SELECT try_url_decode('http%3A%2F%2Fnotebooks.databricks.com%2Fdevrel%2F') as decoded
UNION
--this *won't* work as there's a missing F at the end, and will return a null value
SELECT try_url_decode('http%3A%2F%2Fnotebooks.databricks.com%2Fdevrel%2') as decoded;
decoded |
---|
http://notebooks.databricks.com/devrel/ |
null |
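For a side-by-side, the longer-standing url_encode and url_decode functions (not part of this release) do the strict version of the same job; a small sketch:
-- url_encode percent-encodes a string for you
SELECT url_encode('http://notebooks.databricks.com/devrel/') as encoded;
-- url_decode without the try_ prefix raises an error on malformed input instead of returning a null
-- SELECT url_decode('http%3A%2F%2Fnotebooks.databricks.com%2Fdevrel%2');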
-- zeroifnull returns a zero if the expression is null; nullifzero returns a null if the expression is zero
-- DBR 16.0, https://docs.databricks.com/en/sql/language-manual/functions/zeroifnull.html
--pass zeroifnull a null and it'll return a zero
SELECT null as input, "zeroifnull" as function, zeroifnull(null) as output
UNION
--pass zeroifnull a *non* null and it'll return that value
SELECT 42 as input, "zeroifnull" as function, zeroifnull(42) as output
UNION
--pass nullifzero a zero and it'll return a null
SELECT 0 as input, "nullifzero" as function, nullifzero(0) as output
UNION
--pass nullifzero a *non* zero and it'll return that value
SELECT 42 as input, "nullifzero" as function, nullifzero(42) as output
input | function | output |
---|---|---|
null | zeroifnull | 0 |
42 | zeroifnull | 42 |
0 | nullifzero | null |
42 | nullifzero | 42 |
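A sketch of where nullifzero earns its keep, using made-up order data: wrapping a divisor in nullifzero means a zero quantity gives you a null unit price rather than a divide-by-zero surprise.
-- hypothetical orders: nullifzero(quantity) turns the 0 into a null, so the division quietly returns null
SELECT order_value, quantity, order_value / nullifzero(quantity) AS unit_price
FROM VALUES (100, 4), (50, 0) AS orders(order_value, quantity);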
-- Returns the day of the month from a valid date expression.
-- This is a synonym for extract(DAY FROM expr)
-- DBR 16.1, https://docs.databricks.com/en/sql/language-manual/functions/day.html
-- using the new day function
SELECT day('2019-01-07') as result
UNION ALL
-- the old synonym
SELECT extract(DAY FROM '2019-01-07') as result
UNION ALL
--an invalid date (there is no 30th of February) returns a null
SELECT day('2019-02-30') as result;
result |
---|
7 |
7 |
null |
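A minimal sketch with made-up event dates, since day is mostly useful for bucketing rows:
-- count hypothetical events by day of month
SELECT day(event_date) AS day_of_month, count(*) AS events
FROM VALUES (DATE'2019-01-07'), (DATE'2019-01-07'), (DATE'2019-01-09') AS t(event_date)
GROUP BY ALL;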
-- generates a random number between two specified values with a uniform distribution, ie an equal chance of getting any number in the range
-- there's an option to specify a seed to consistently get the same random number
-- DBR 16.1, https://docs.databricks.com/en/sql/language-manual/functions/uniform.html
-- select a random number between 10 and 20
-- if you run this cell multiple times the number will update
SELECT uniform(10, 20) as random_no
UNION ALL
-- select a consistently random number between 10 and 20
-- if you run this cell multiple times the number *won't* update
SELECT uniform(10, 20, 42) as random_no
random_no |
---|
10 |
16 |
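uniform is handy for knocking up synthetic data; a sketch that assumes the seedless form re-rolls for every row (much like rand does):
-- five rows of made-up scores between 0 and 100
SELECT id, uniform(0, 100) AS score
FROM range(5);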
-- returns a random string of specified length
-- the string can contain uppercase letters, lowercase letters and digits
-- there's an option to specify a seed to consistently get the same random string
-- DBR 16.1, https://docs.databricks.com/en/sql/language-manual/functions/randstr.html
-- creates a random string of length 10
SELECT randstr(10) as random_string
UNION ALL
-- creates a consistently random string of length 10
SELECT randstr(10, 42) as random_string;
random_string |
---|
TPw2OkGx5W |
pll6YOIJNn |
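Put the two together and you have a quick and dirty fake-data generator; a sketch with invented column names:
-- three rows of entirely fictional users
SELECT randstr(8) AS fake_user_id, uniform(18, 99) AS fake_age
FROM range(3);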
-- The following functions now support named parameter invocation: variant_explode, variant_explode_outer, inline, inline_outer, posexplode, posexplode_outer
-- this means you have flexibility in the argument order and it can improve readability
--instead of this
SELECT pos, col FROM posexplode(array(10, 20));
-- you can do this
SELECT pos, col FROM posexplode(collection => array(10, 20));
pos | col |
---|---|
0 | 10 |
1 | 20 |
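The same trick should work when exploding a map, assuming the collection parameter name carries over from the array form above; a sketch:
-- posexplode over a map gives you pos, key and value columns
SELECT pos, key, value FROM posexplode(collection => map('ham', 1, 'cheese', 2));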
🦩 Primary Key and Foreign Key relationship constraints#
CREATE TABLE ham_types(ham_type STRING NOT NULL, is_edible BOOLEAN, --standard table definition
CONSTRAINT ham_key PRIMARY KEY(ham_type)); --name and define the constraint
CREATE TABLE sandwich_feasibility(ham_type STRING NOT NULL, is_feasible BOOLEAN,
CONSTRAINT ham_sandwich_key FOREIGN KEY(ham_type) REFERENCES ham_types);
select * from information_schema.key_column_usage
where constraint_name in ('ham_key','ham_sandwich_key');
constraint_catalog | constraint_schema | constraint_name | table_catalog | table_schema | table_name | column_name | ordinal_position | position_in_unique_constraint |
---|---|---|---|---|---|---|---|---|
shared | default | ham_sandwich_key | shared | default | sandwich_feasibility | ham_type | 1 | 1 |
shared | default | ham_key | shared | default | ham_types | ham_type | 1 | null |
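These constraints are informational rather than enforced, and they can be dropped and re-added after the fact; a sketch on the tables above:
-- nothing stops you inserting a ham_type into sandwich_feasibility that has no parent row,
-- but the relationship is still handy documentation of how the tables hang together
ALTER TABLE sandwich_feasibility DROP CONSTRAINT ham_sandwich_key;
ALTER TABLE sandwich_feasibility ADD CONSTRAINT ham_sandwich_key
  FOREIGN KEY (ham_type) REFERENCES ham_types(ham_type);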
🦩 Structured Streaming and liquid clustering compatibility improvements#
%python
(spark.readStream.format("rate") # generate some random data of timestamps and a value
.load()
.writeStream
.clusterBy("timestamp") # <- this bit is new and now possible
.option("checkpointLocation", "dbfs:/tmp/checkpoint/random_numbers")
.trigger(once=True) # let's not have this run forever
.toTable("random_numbers") # new table created as part of this syntax
)
<pyspark.sql.streaming.query.StreamingQuery at 0x7f1992964e90>
describe extended random_numbers --check out rows 3-5, 24
col_name | data_type | comment |
---|---|---|
timestamp | timestamp | null |
value | bigint | null |
# Clustering Information | ||
# col_name | data_type | comment |
timestamp | timestamp | null |
# Delta Statistics Columns | ||
Column Names | timestamp, value | |
Column Selection Method | first-32 | |
# Detailed Table Information | ||
Catalog | shared | |
Database | default | |
Table | random_numbers | |
Created Time | Thu Jan 09 10:06:43 UTC 2025 | |
Last Access | UNKNOWN | |
Created By | Spark | |
Type | MANAGED | |
Location | s3://databricks-devrel/unity-catalog/556849771150522/__unitystorage/catalogs/5af9d1b9-ce84-4845-a5f7-3a0ab902e978/tables/7919975b-c13a-4995-ba7b-ccfda121c61e | |
Provider | delta | |
Owner | holly.smith@databricks.com | |
Is_managed_location | true | |
Predictive Optimization | ENABLE (inherited from METASTORE metastore_aws_us_west_2) | |
Table Properties | [clusteringColumns=[["timestamp"]],delta.checkpointPolicy=v2,delta.enableDeletionVectors=true,delta.enableRowTracking=true,delta.feature.appendOnly=supported,delta.feature.clustering=supported,delta.feature.deletionVectors=supported,delta.feature.domainMetadata=supported,delta.feature.invariants=supported,delta.feature.rowTracking=supported,delta.feature.v2Checkpoint=supported,delta.minReaderVersion=3,delta.minWriterVersion=7,delta.rowTracking.materializedRowCommitVersionColumnName=_row-commit-version-col-0edb6827-803c-450f-b73b-1652fe05801e,delta.rowTracking.materializedRowIdColumnName=_row-id-col-a162077e-a282-4f39-8c9e-197ef9fa8a59] |
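Once the stream has created the table, the clustering key can still be changed with plain SQL; a sketch, with OPTIMIZE reclustering the files that are already there:
-- swap the clustering key on the table the stream created, then recluster existing data
ALTER TABLE random_numbers CLUSTER BY (value);
OPTIMIZE random_numbers;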
🐘 Warehouse system tables#
%sh
curl -v -X PUT -H "Authorization: Bearer <Developer Access Token>" "https://<your workspace url>.cloud.databricks.com/api/2.0/unity-catalog/metastores/<metastore id found in UC>/systemschemas/compute"
select * from system.compute.warehouses
--sorry removing this result for sensitive data
--but just imagine the best practices you've ever seen
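To give a flavour of what you could do with it, a sketch assuming the warehouse_name, warehouse_size and auto_stop_minutes columns from the docs:
-- hunt for warehouses where auto stop looks to be switched off (a value of 0)
SELECT warehouse_name, warehouse_size, auto_stop_minutes
FROM system.compute.warehouses
WHERE auto_stop_minutes = 0;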
🐘 Materialised View Updates#
To schedule these awe-inspiring examples of MVs, move them to the SQL editor; otherwise DLT will try to run everything in this notebook
CREATE MATERIALIZED VIEW ham_feasibility
SCHEDULE EVERY 1 HOUR --new schedule syntax
AS SELECT `🍖`.ham_type, `🥪`.is_feasible
FROM ham_types AS `🍖`
LEFT JOIN sandwich_feasibility AS `🥪`
ON `🍖`.ham_type = `🥪`.ham_type;
Name | Type |
---|---|
ham_type | string |
is_feasible | boolean |
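The hourly schedule is handled for you, but you can also kick off a refresh on demand; a quick sketch:
-- trigger a one-off refresh outside the hourly schedule
REFRESH MATERIALIZED VIEW ham_feasibility;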
CREATE FUNCTION veggie_filter(is_edible BOOLEAN) --let's stop taunting our veggie friends
RETURN IF(IS_ACCOUNT_GROUP_MEMBER('omnivore'), true, is_edible = false); --omnivores see every row; if they're not part of the omnivore group, we'll hide the edible piggy products
CREATE MATERIALIZED VIEW ham_options (ham_type STRING, is_edible BOOLEAN) --now this view will show results tailored to whether or not the user is part of the omnivore group stored in unity catalog
WITH ROW FILTER veggie_filter ON (is_edible) --apply the function we just defined to the is_edible column
AS SELECT * FROM ham_types
Name | Type |
---|---|
ham_type | string |
is_edible | boolean |
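Before trusting the filter, it's worth poking the function directly to see which way it would swing for you; a small sketch:
-- see what the row filter would decide for the current user
SELECT current_user() AS who_am_i,
       is_account_group_member('omnivore') AS is_omnivore,
       veggie_filter(true) AS would_an_edible_row_survive;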
--uh oh! but what if someone is new and we don't know which group to put them in?
--the only sane thing to do is hide the contents of this column from them until they're added to a group
CREATE FUNCTION undefined_preference(is_edible BOOLEAN)
RETURN CASE WHEN is_member('omnivore') THEN is_edible
WHEN is_member('veggie') THEN is_edible
ELSE NULL END;
CREATE OR REPLACE MATERIALIZED VIEW ham_options
(ham_type STRING,
is_edible BOOLEAN MASK undefined_preference) -- applying the column mask to our materialised view
WITH ROW FILTER veggie_filter ON (is_edible)
AS SELECT * FROM ham_types;
Name | Type |
---|---|
ham_type | string |
is_edible | boolean |
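The mask function can be called directly too; a null back means you haven't been put in either group yet. A sketch:
-- what would the masked column show me?
SELECT undefined_preference(true) AS what_i_would_see;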
CREATE MATERIALIZED VIEW preview_channel_example
TBLPROPERTIES(pipelines.channel = "PREVIEW") --set the compute to use features in public preview
AS SELECT day('2019-01-07') as result
Name | Type |
---|---|
result | int |
🦩 Lite mode for VACUUM#
VACUUM ham_types LITE;
path |
---|
s3://databricks-devrel/unity-catalog/556849771150522/__unitystorage/catalogs/5af9d1b9-ce84-4845-a5f7-3a0ab902e978/tables/0e26978e-85a9-46d6-b74a-d91f6421861f |
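For comparison, the existing behaviour can be spelled out with FULL (still the default), which lists everything under the table location rather than working from the Delta transaction log:
-- the default mode, made explicit
VACUUM ham_types FULL;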