Initial commit

This commit is contained in:
Tim_Doernemann 2026-05-28 17:44:11 +02:00
parent d4028fca11
commit a578239c4f
32 changed files with 2559 additions and 0 deletions

View file

@ -0,0 +1,109 @@
import pendulum
import os
from airflow.sdk import dag, task
from airflow.models.connection import Connection
from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator
from stackit_workflows import config
# Airflow Connection "s3_papermill_output" must be configured to write notebook output to S3:
# host = <S3 endpoint URL> (used as BOTO3_ENDPOINT_URL)
# login = <AWS_ACCESS_KEY_ID>
# password = <AWS_SECRET_ACCESS_KEY>
try:
s3_connection = Connection.get_connection_from_secrets("s3_papermill_output")
except Exception:
import warnings
warnings.warn(
"Airflow connection 's3_papermill_output' is not configured. "
"This DAG will be visible but cannot be run until the connection is created. "
"Please create an Airflow connection with conn_id='s3_papermill_output': "
"host=<S3 endpoint URL>, login=<AWS_ACCESS_KEY_ID>, password=<AWS_SECRET_ACCESS_KEY>.",
stacklevel=2,
)
s3_connection = None
@dag(
schedule="0 7 * * *",
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
catchup=False,
tags=["demo","stackit-demo"],
dag_id="stackit_13_demo_sales_usecase",
)
def spark_full_configuration():
if s3_connection is None:
# S3 connection not available — define a placeholder task so the DAG
# can still be imported and inspected.
@task()
def setup_required():
raise RuntimeError(
"Airflow connection 's3_papermill_output' is not configured. "
"Please create it with: host=<S3 endpoint URL>, "
"login=<AWS_ACCESS_KEY_ID>, password=<AWS_SECRET_ACCESS_KEY>."
)
setup_required()
return
output_s3 = STACKITSparkScriptOperator(
task_id="save_results_in_s3",
# image : str, optional
# Image of the launched container.
# If not specified, a current spark image maintained by stackit is used.
image="schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2",
# get_logs : bool, optional
# Read stdout get the stdout of the container as logs of the tasks. Default: True
get_logs=True,
# is_papermill : bool, optional
# Flag to define if the script defined in the `script` parameter
# is a jupyter notebook that is supposed to be executed using papermill.
# This parameter does not need to be set in order to use papermill. If
# it is not set the spark operator identifies notebooks automatically
# and uses papermill.
is_papermill=True,
# script : str
# Path of the script to execute. If a relative path is specified, it
# is interpreted relative to the git-sync folder.
# If an absolute path is specified it is not adapted further.
script="Demo/scripts/sales_prediction.ipynb",
# papermill_target_folder: str
# If papermill is supposed to be used, some target folder needs to be defined
# here in order to write the target notebook to the corresponding folder.
# If this is set to "-" the outpout is written to stdout.
papermill_target_folder="s3://internal-airflow-papermill-output/notebooks",
# s3_arguments : dict
# Arguments used to authenticate to S3. Should be used to pass
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and BOTO3_ENDPOINT_URL
# which are required for authentication to external s3 for pushing
# the results to.
s3_arguments={
"AWS_ACCESS_KEY_ID": s3_connection.login,
"AWS_SECRET_ACCESS_KEY": s3_connection.password,
"BOTO3_ENDPOINT_URL": s3_connection.host,
},
# dremio_connections : list of str, optional
# List of connection ids that are used to configure spark.
# The "lakehouse" connection is pre-configured and connects to
# the same lakehouse as Dremio.
dremio_connections=["lakehouse-rest"],
)
output_stdout = STACKITSparkScriptOperator(
task_id="run_prediction",
image="schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2",
get_logs=True,
is_papermill=True,
script="Demo/scripts/sales_prediction.ipynb",
papermill_target_folder="-",
s3_arguments={
"AWS_ACCESS_KEY_ID": s3_connection.login,
"AWS_SECRET_ACCESS_KEY": s3_connection.password,
"BOTO3_ENDPOINT_URL": s3_connection.host,
},
dremio_connections=["lakehouse-rest"],
)
output_stdout >> output_s3
spark_full_configuration()