Data Engineering SQL Holiday Specials#

Worked examples of the new features released around the 2024 holiday season, accompanying the blog post here

To try and spice up what is essentially a list of features, I’ve categorised them into release stage (GA vs Public Preview) as well as the Holly Smith scale of feature usefulness:

🐳 Extra Large - 🐘 Large - 🦩 Medium - 🦔 Small#

Requirements#

To run these you will need a single user cluster running DBR 16.1.

To test out every feature to its fullest, you will optionally need:

  • A serverless warehouse to run statements in the SQL editor [materialised views]

  • The ability to create DLT pipelines [materialised views]

  • Your UC metastore ID and an admin personal access token (PAT) to enable system tables [warehouse system tables]

🐘 New SQL functions#

-- takes an encoded URL and attempts to decode it
-- encoded URLs are where characters that can't be ASCII encoded (+, ?, /, #, & etc) are replaced with a % followed by a hexadecimal number (ie where the letters A-F are used as digits after 0-9)
-- DBR 16.0, https://docs.databricks.com/en/sql/language-manual/functions/try_url_decode.html

-- this will work as it's a valid URL, and will return the decoded URL
SELECT try_url_decode('http%3A%2F%2Fnotebooks.databricks.com%2Fdevrel%2F') as decoded

UNION 
--this *won't* work as there's a missing F at the end, and will return a null value
SELECT try_url_decode('http%3A%2F%2Fnotebooks.databricks.com%2Fdevrel%2') as decoded;
decoded
http://notebooks.databricks.com/devrel/
null
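For contrast, the longer-standing url_decode function fails loudly on malformed input rather than returning null. A minimal sketch, reusing the truncated URL from above:

-- the non-try variant raises an error on the same malformed input instead of returning null
SELECT url_decode('http%3A%2F%2Fnotebooks.databricks.com%2Fdevrel%2') as decoded;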
-- zeroifnull returns 0 if the expression is null; nullifzero returns null if the expression is 0
-- DBR 16.0, https://docs.databricks.com/en/sql/language-manual/functions/zeroifnull.html

--pass zeroifnull a null and it'll return a zero
SELECT null as input, "zeroifnull" as function, zeroifnull(null) as output

UNION
--pass zeroifnull a *non* null and it'll return that value
SELECT 42 as input, "zeroifnull" as function, zeroifnull(42) as output

UNION
--pass nullifzero a zero and it'll return a null
SELECT 0 as input, "nullifzero" as function, nullifzero(0) as output

UNION
--pass nullifzero a *non* null and it'll return that value
SELECT 42 as input, "nullifzero" as function, nullifzero(42) as output
| input | function | output |
| --- | --- | --- |
| null | zeroifnull | 0 |
| 42 | zeroifnull | 42 |
| 0 | nullifzero | null |
| 42 | nullifzero | 42 |
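One place nullifzero earns its keep is guarding against division-by-zero errors, which ANSI mode turns into hard failures. A minimal sketch with made-up numbers:

-- dividing by nullifzero(0) quietly returns null instead of throwing a divide-by-zero error
SELECT 100 / nullifzero(0) as items_per_day;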
-- Returns the day of the month from a valid date expression.
-- This is a synonym for extract(DAY FROM expr)
-- DBR 16.1, https://docs.databricks.com/en/sql/language-manual/functions/day.html

-- using the new day function
SELECT day('2019-01-07') as result
UNION ALL

-- the old synonym
SELECT extract(DAY FROM '2019-01-07') as result
UNION ALL

-- an invalid date (there's no 30th of February) returns null
SELECT day('2019-02-30') as result;
result
7
7
null
-- generates a random number between two specified values with a uniform distribution, ie an equal chance of getting any number in the range
-- there's an option to specify a seed to consistently get the same random number
-- DBR 16.1, https://docs.databricks.com/en/sql/language-manual/functions/uniform.html

-- select a random number between 10 and 20
-- if you run this cell multiple times the number will update
SELECT uniform(10, 20) as random_no
UNION ALL 

-- select a consistently random number between 10 and 20
-- if you run this cell multiple times the number *won't* update
SELECT uniform(10, 20, 42) as random_no
random_no
10
16
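If you want more than a single draw, uniform pairs nicely with range() to mock up a whole column of values. A small sketch; without a seed, every row gets its own number:

-- five rows, each with its own random number between 10 and 20
SELECT id, uniform(10, 20) as random_no FROM range(5);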
-- returns a random string of specified length
-- the string can contain uppercase, lowercase and numeric values 
-- there's an option to specify a seed to consistently get the same random string
-- DBR 16.1, https://docs.databricks.com/en/sql/language-manual/functions/randstr.html

-- creates a random string of length 10
SELECT randstr(10) as random_string
UNION ALL

-- creates a consistently random string of length 10
SELECT randstr(10, 42) as random_string;
random_string
TPw2OkGx5W
pll6YOIJNn
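Combining randstr and uniform is a quick way to knock up fake test data. A minimal sketch; fake_id and fake_age are made-up column names for illustration:

-- three rows of throwaway test data
SELECT randstr(8) as fake_id, uniform(18, 65) as fake_age FROM range(3);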
-- The following functions now support named parameter invocation: variant_explode, variant_explode_outer, inline, inline_outer, posexplode, posexplode_outer
-- this means you have more flexibility in the argument order and can improve readability

--instead of this
SELECT pos, col FROM posexplode(array(10, 20));

-- you can do this
SELECT pos, col FROM posexplode(collection => array(10, 20));
| pos | col |
| --- | --- |
| 0 | 10 |
| 1 | 20 |

🦩 Primary Key and Foreign Key relationship constraints#

CREATE TABLE ham_types(ham_type STRING NOT NULL, is_edible BOOLEAN, --standard table definition
                CONSTRAINT ham_key PRIMARY KEY(ham_type)); --name and define the constraint
CREATE TABLE sandwich_feasibility(ham_type STRING NOT NULL, is_feasible BOOLEAN,
                CONSTRAINT ham_sandwich_key FOREIGN KEY(ham_type) REFERENCES ham_types);
SELECT * FROM information_schema.key_column_usage
WHERE constraint_name IN ('ham_key','ham_sandwich_key');
| constraint_catalog | constraint_schema | constraint_name | table_catalog | table_schema | table_name | column_name | ordinal_position | position_in_unique_constraint |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| shared | default | ham_sandwich_key | shared | default | sandwich_feasibility | ham_type | 1 | 1 |
| shared | default | ham_key | shared | default | ham_types | ham_type | 1 | null |
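Worth remembering: these primary and foreign key constraints are informational only and aren't enforced, so an insert that breaks the relationship still goes through. A quick sketch:

-- no matching ham_type exists in ham_types, but the insert succeeds anyway
INSERT INTO sandwich_feasibility VALUES ('mystery meat', true);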

🦩 Structured Streaming and liquid clustering compatibility improvements#

%python
(spark.readStream.format("rate") # generate some random data of timestamps and a value
  .load()
  .writeStream 
  .clusterBy("timestamp") # <- this bit is new and now possible 
  .option("checkpointLocation", "dbfs:/tmp/checkpoint/random_numbers")
  .trigger(once=True)  # let's not have this run forever
  .toTable("random_numbers") # new table created as part of this syntax
)
<pyspark.sql.streaming.query.StreamingQuery at 0x7f1992964e90>
describe extended random_numbers --check out the Clustering Information and Table Properties rows
| col_name | data_type | comment |
| --- | --- | --- |
| timestamp | timestamp | null |
| value | bigint | null |
| # Clustering Information | | |
| # col_name | data_type | comment |
| timestamp | timestamp | null |
| # Delta Statistics Columns | | |
| Column Names | timestamp, value | |
| Column Selection Method | first-32 | |
| # Detailed Table Information | | |
| Catalog | shared | |
| Database | default | |
| Table | random_numbers | |
| Created Time | Thu Jan 09 10:06:43 UTC 2025 | |
| Last Access | UNKNOWN | |
| Created By | Spark | |
| Type | MANAGED | |
| Location | s3://databricks-devrel/unity-catalog/556849771150522/__unitystorage/catalogs/5af9d1b9-ce84-4845-a5f7-3a0ab902e978/tables/7919975b-c13a-4995-ba7b-ccfda121c61e | |
| Provider | delta | |
| Owner | holly.smith@databricks.com | |
| Is_managed_location | true | |
| Predictive Optimization | ENABLE (inherited from METASTORE metastore_aws_us_west_2) | |
| Table Properties | [clusteringColumns=[["timestamp"]],delta.checkpointPolicy=v2,delta.enableDeletionVectors=true,delta.enableRowTracking=true,delta.feature.appendOnly=supported,delta.feature.clustering=supported,delta.feature.deletionVectors=supported,delta.feature.domainMetadata=supported,delta.feature.invariants=supported,delta.feature.rowTracking=supported,delta.feature.v2Checkpoint=supported,delta.minReaderVersion=3,delta.minWriterVersion=7,delta.rowTracking.materializedRowCommitVersionColumnName=_row-commit-version-col-0edb6827-803c-450f-b73b-1652fe05801e,delta.rowTracking.materializedRowIdColumnName=_row-id-col-a162077e-a282-4f39-8c9e-197ef9fa8a59] | |
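clusterBy on the stream only clusters data as it's written; to (re)cluster what has already landed in the table, an OPTIMIZE does the job. A minimal sketch:

-- incrementally clusters existing files by the timestamp column chosen above
OPTIMIZE random_numbers;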

🐘 Warehouse system tables#

%sh
curl -v -X PUT -H "Authorization: Bearer <Developer Access Token>" "https://<your workspace url>.cloud.databricks.com/api/2.0/unity-catalog/metastores/<metastore id found in UC>/systemschemas/compute"
select * from system.compute.warehouses
--sorry removing this result for sensitive data
--but just imagine the best practices you've ever seen
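To give a flavour of what you can do once the table is enabled, here's a sketch of an audit query. It assumes columns like warehouse_name, warehouse_size, auto_stop_minutes and delete_time; check the table's schema if they differ in your release:

-- a sketch: find warehouses that still exist, ordered by how long they'll sit idle before auto-stopping
SELECT warehouse_name, warehouse_size, auto_stop_minutes
FROM system.compute.warehouses
WHERE delete_time IS NULL
ORDER BY auto_stop_minutes DESC;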

🐘 Materialised View Updates#

To schedule these awe-inspiring examples of MVs, move them to the SQL editor, otherwise DLT will try to run everything in this notebook

CREATE MATERIALIZED VIEW ham_feasibility
SCHEDULE EVERY 1 HOUR --new schedule syntax
AS SELECT `🍖`.ham_type, `🥪`.is_feasible 
FROM ham_types AS `🍖`
LEFT JOIN sandwich_feasibility AS `🥪`
ON `🍖`.ham_type = `🥪`.ham_type;
ham_feasibility is defined as a Delta Live Tables dataset with schema:
Name Type
ham_type string
is_feasible boolean
To populate your table you must either:
  • Run an existing pipeline using the Delta Live Tables menu
  • Create a new pipeline: Create Pipeline
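The schedule takes care of itself, but if you want to trigger an update on demand (handy right after creating the MV in the SQL editor), a manual refresh works too:

-- kicks off an update of the materialised view outside its hourly schedule
REFRESH MATERIALIZED VIEW ham_feasibility;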
CREATE FUNCTION veggie_filter(is_edible BOOLEAN) --let's stop taunting our veggie friends
RETURN IF(IS_ACCOUNT_GROUP_MEMBER('omnivore'), true, is_edible = false); --omnivores see every row; if they're not part of the omnivore group, we'll hide the edible piggy products

CREATE MATERIALIZED VIEW ham_options (ham_type STRING, is_edible BOOLEAN) --now this view will show results tailored to whether or not the user is part of the omnivore group stored in unity catalog
WITH ROW FILTER veggie_filter ON (is_edible) --apply the function we just defined to the is_edible column
AS SELECT * FROM ham_types
ham_options is defined as a Delta Live Tables dataset with schema:
Name Type
ham_type string
is_edible boolean
To populate your table you must either:
  • Run an existing pipeline using the Delta Live Tables menu
  • Create a new pipeline: Create Pipeline
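Before querying the filtered view, it's worth checking which branch of the filter the current user will hit. A quick sketch:

-- returns true if the current user is in the omnivore account group, false otherwise
SELECT is_account_group_member('omnivore') as am_i_an_omnivore;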
--uh oh! but what if someone is new and we don't know which group to put them in?
--the only sane thing to do is hide the contents of this column from them until they're added to a group
CREATE FUNCTION undefined_preference(is_edible BOOLEAN)
  RETURN CASE WHEN is_member('omnivore') THEN is_edible 
              WHEN is_member('veggie') THEN is_edible 
              ELSE NULL END;

CREATE OR REPLACE MATERIALIZED VIEW ham_options 
  (ham_type STRING, 
  is_edible BOOLEAN MASK undefined_preference) -- applying the column mask to our materialised view
WITH ROW FILTER veggie_filter ON (is_edible)
AS SELECT * FROM ham_types;
ham_options is defined as a Delta Live Tables dataset with schema:
Name Type
ham_type string
is_edible boolean
To populate your table you must either:
  • Run an existing pipeline using the Delta Live Tables menu
  • Create a new pipeline: Create Pipeline
CREATE MATERIALIZED VIEW preview_channel_example
TBLPROPERTIES(pipelines.channel = "PREVIEW") --set the compute to use features in public preview
AS SELECT day('2019-01-07') as result
preview_channel_example is defined as a Delta Live Tables dataset with schema:
Name Type
result int
To populate your table you must either:
  • Run an existing pipeline using the Delta Live Tables menu
  • Create a new pipeline: Create Pipeline

🦩 Lite mode for VACUUM#

VACUUM ham_types LITE;
path
s3://databricks-devrel/unity-catalog/556849771150522/__unitystorage/catalogs/5af9d1b9-ce84-4845-a5f7-3a0ab902e978/tables/0e26978e-85a9-46d6-b74a-d91f6421861f
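If you'd rather see what a vacuum would remove before committing to it, DRY RUN lists the candidate files without deleting anything. A minimal sketch:

-- lists the files that would be deleted, without actually removing them
VACUUM ham_types DRY RUN;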

🐷#