Initial commit
This commit is contained in:
parent
d4028fca11
commit
a578239c4f
32 changed files with 2559 additions and 0 deletions
150
Demo/stackit-08-table-maintenance.py
Normal file
150
Demo/stackit-08-table-maintenance.py
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
import pendulum
|
||||
|
||||
from airflow.sdk import dag, task, Connection
|
||||
from stackit_workflows.airflow_plugin.decorators import stackit
|
||||
|
||||
# Keyword arguments shared by every Spark-on-Kubernetes task in this DAG.
default_kwargs = {
    # Container image the Spark tasks run in (registry path + image:tag).
    "image": (
        "schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/"
        "stackit-spark:spark3.5.3-0.1.2"
    ),
}
|
||||
|
||||
# Resolve the lakehouse connection while the DAG file is parsed. A missing
# connection must not break parsing: the DAG should still be listed, so the
# failure is downgraded to a warning and the name falls back to None.
lakehouse_connection = None
try:
    lakehouse_connection = Connection.get("lakehouse-rest")
except Exception:
    import warnings

    warnings.warn(
        message=(
            "Airflow connection 'lakehouse-rest' is not configured. "
            "This DAG will be visible but cannot be run until the connection is created. "
            "Please create an Airflow connection with conn_id='lakehouse-rest' — "
            "see the comments in Demo/06-lakehouse.py for the required fields."
        ),
        stacklevel=2,
    )
|
||||
|
||||
def _run_dremio_query(query_template):
    """Run one SQL statement against Dremio over Arrow Flight.

    The connection details are read from the Airflow connection
    ``dremio-flight``: ``host`` is used as the Flight URI, ``login`` and
    ``password`` as credentials, and ``schema`` as the catalog name.

    Args:
        query_template: SQL text. Every ``{catalog}`` placeholder is replaced
            with the ``schema`` field of the ``dremio-flight`` connection.

    Returns:
        Whatever ``tools.execute_dremio_query.read_query_pandas`` returns for
        the statement.
    """
    import pathlib
    import sys

    # The ``tools`` package sits next to this DAG file; make it importable,
    # but do not grow sys.path on every task run.
    dag_dir = str(pathlib.Path(__file__).parent)
    if dag_dir not in sys.path:
        sys.path.append(dag_dir)
    from tools.execute_dremio_query import initialize_flight, read_query_pandas
    from airflow.hooks.base import BaseHook

    dremio_connection = BaseHook.get_connection("dremio-flight")
    # Log the non-secret connection fields only. The password is deliberately
    # never printed: task logs are readable by anyone with access to the UI.
    print(f"Host: {dremio_connection.host}")
    print(f"Login: {dremio_connection.login}")
    print(f"Catalog: {dremio_connection.schema}")

    client, auth = initialize_flight(
        flight_uri=dremio_connection.host,
        username=dremio_connection.login,
        password=dremio_connection.password,
    )
    return read_query_pandas(
        client,
        auth,
        query=query_template.format(catalog=dremio_connection.schema),
    )


@dag(
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    tags=["demo", "stackit-demo"],
    dag_id="stackit_08_table_maintenance",
)
def table_maintenance():
    """
    This DAG gives an example implementation on how you can operationalize your table maintenance using Airflow and Dremio.
    There are two examples: one for the VACUUM/expire_snapshots and one for the OPTIMIZE/rewrite_data_files command.
    Each is shown twice: as an Iceberg Spark procedure and as a Dremio SQL statement.
    Full reference of all available Iceberg optimization procedures can be found here: https://iceberg.apache.org/docs/1.8.0/spark-procedures/.
    For further information refer to the StackIT Data Platform User Documentation: https://itdoc.schwarz/display/STACKIT/Dremio+-+Iceberg+Table+Maintenance+Guide.

    Task order: vacuum_table_using_spark -> rewrite_data_files ->
    (vacuum_catalog_using_dremio, optimize_table_using_dremio).
    """

    @stackit.spark_kubernetes_task(**default_kwargs, dremio_connections=["lakehouse-rest"])
    def vacuum_table_using_spark():
        """
        Each write/update/delete/upsert/compaction in Iceberg produces a new snapshot while keeping the old data
        and metadata around for snapshot isolation and time travel. The expire_snapshots procedure can be used
        to remove older snapshots and their files which are no longer needed.

        Calls ``system.expire_snapshots`` on the target table with ``retain_last => 100``.
        """
        import os

        import stackit_spark

        # Debug aid: list which environment variables are present. Only the
        # names are logged; the values may contain credentials.
        for name in sorted(os.environ):
            print(name)

        catalog_name_in_spark = "catalog-s3"
        target_table = "DEMO.user"

        spark = stackit_spark.get_spark()
        spark.sql(f"USE `{catalog_name_in_spark}`")

        result = spark.sql(
            f"CALL `{catalog_name_in_spark}`.system.expire_snapshots("
            f"table => '{target_table}', "
            "retain_last => 100"
            ")"
        )

        result.show()

    @stackit.spark_kubernetes_task(**default_kwargs, dremio_connections=["lakehouse-rest"])
    def rewrite_data_files():
        """
        Iceberg tracks each data file in a table. More data files leads to more metadata stored in manifest files,
        and small data files causes an unnecessary amount of metadata and less efficient queries from file open costs.
        Iceberg can compact data files in parallel using Spark with the rewriteDataFiles action.
        This will combine small files into larger files to reduce metadata overhead and runtime file open cost.

        Calls ``system.rewrite_data_files`` on the target table with the procedure's default options.
        """
        import os

        import stackit_spark

        # Debug aid: list which environment variables are present. Only the
        # names are logged; the values may contain credentials.
        for name in sorted(os.environ):
            print(name)

        catalog_name_in_spark = "catalog-s3"
        target_table = "DEMO.user"

        spark = stackit_spark.get_spark(
            additional_config={
                "spark.jars.packages": "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.iceberg:iceberg-aws-bundle:1.6.1"
            }
        )
        spark.sql(f"USE `{catalog_name_in_spark}`")

        result = spark.sql(
            f"CALL `{catalog_name_in_spark}`.system.rewrite_data_files(table => '{target_table}')"
        )

        result.show()

    @task(task_id="vacuum_catalog_using_dremio")
    def vacuum_catalog_using_dremio():
        """Issue ``VACUUM CATALOG`` in Dremio for the catalog named by the ``dremio-flight`` connection."""
        data = _run_dremio_query('VACUUM CATALOG "{catalog}";')
        print(data)

    @task(task_id="optimize_table_using_dremio")
    def optimize_table_using_dremio():
        """Issue ``OPTIMIZE TABLE ... REWRITE DATA`` in Dremio for ``"E2E_TEST"."bronze_user"``.

        The catalog part of the table path comes from the ``dremio-flight`` connection.
        """
        data = _run_dremio_query(
            '''
                OPTIMIZE TABLE "{catalog}"."E2E_TEST"."bronze_user"
                REWRITE DATA (MIN_FILE_SIZE_MB=100, MAX_FILE_SIZE_MB=1000, TARGET_FILE_SIZE_MB=512);
            '''
        )
        print(data)

    # Spark maintenance runs first; the two Dremio statements then run in parallel.
    vacuum_table_using_spark() >> rewrite_data_files() >> [vacuum_catalog_using_dremio(), optimize_table_using_dremio()]


table_maintenance()
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue