workflows-example-dags/Demo/stackit-14-stackit-operators.py
2026-05-28 17:44:11 +02:00

141 lines
5.9 KiB
Python

import pendulum
from airflow.sdk import dag
from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator, STACKITPythonScriptOperator
from stackit_workflows.airflow_plugin.decorators import stackit
# -------------------------------------------------------------------------
# DEMO DAG: How to use the STACKIT Operators
# -------------------------------------------------------------------------
# This DAG demonstrates the different ways to run workloads on Kubernetes
# using the STACKIT operator suite. It covers three main scenarios:
#
# 1. Running inline Python code with the @stackit.python_kubernetes_task decorator
# 2. Running inline Spark code with the @stackit.spark_kubernetes_task decorator
# 3. Running standalone Python or Spark scripts with
#    STACKITPythonScriptOperator / STACKITSparkScriptOperator
#
# Each operator abstracts away boilerplate (like syncing the repo,
# configuring the namespace, injecting context, mounting secrets, etc.),
# so you can focus only on your workload.
# -------------------------------------------------------------------------
# Shared keyword arguments for the Spark-based tasks in this file.
# Pinning the image is good practice (but not required). By default, the
# STACKIT Spark image contains Spark, Java, Python, and common libs.
default_kwargs: dict[str, str] = {
    "image": "schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2",
}
@dag(
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    tags=["demo", "stackit-demo"],
    dag_id="stackit_14_stackit_operators",
)
def stackit_operators():
    # Builds the demo DAG: two decorated tasks run one after the other,
    # followed by the two script-based operators (wiring is at the bottom).

    # ---------------------------------------------------------------------
    # 1) PYTHON DECORATOR
    # ---------------------------------------------------------------------
    # The @stackit.python_kubernetes_task decorator is the simplest way
    # to run Python code in Kubernetes.
    #
    # Features provided out-of-the-box:
    # - Uses a prebuilt Python image with pandas, requests, etc.
    # - Injects Airflow context (dag_id, run_id, task_id, etc.) as ENV vars
    # - Git-syncs your DAG repo into the pod (so imports just work)
    # - Runs in the correct namespace with STACKIT defaults
    #
    # This is the preferred way if you want to write Python directly inside
    # your DAG file without worrying about container setup.
    @stackit.python_kubernetes_task()
    def stackit_decorated_python_kubernetes():
        # Imports are kept inside the function body, next to the code
        # that uses them.
        import os

        # Example: import a helper module from the repo
        from scripts.my_tools.say_hello import say_hello

        say_hello()

        # Show how Airflow context is injected as environment variables.
        # Indexing (rather than .get) raises KeyError if the variable is
        # missing, which makes a broken context injection obvious.
        task_id = os.environ["AIRFLOW__CONTEXT__TASK__TASK_ID"]
        print(f"Task ID: {task_id}")

        # Print all relevant injected ENV variables, sorted by name.
        # str.startswith accepts a tuple of prefixes, so a single call
        # covers both families of variables.
        injected_prefixes = ("AIRFLOW__CONTEXT", "STACKIT__")
        env_vars = {
            key: value
            for key, value in os.environ.items()
            if key.startswith(injected_prefixes)
        }
        for key, value in sorted(env_vars.items()):
            print(f"{key}: {value}")

    # ---------------------------------------------------------------------
    # 2) SPARK DECORATOR
    # ---------------------------------------------------------------------
    # The @stackit.spark_kubernetes_task decorator makes it easy to run
    # Spark jobs from Airflow, without setting up Spark-on-K8s manually.
    #
    # Features:
    # - Starts a single-node Spark cluster by default
    # - Includes the stackit_spark helper library for easy SparkSession setup
    # - Allows you to scale CPU/memory via parameters
    #
    # This is the preferred way to run Spark transformations as functions.
    @stackit.spark_kubernetes_task(**default_kwargs)
    def stackit_decorated_spark_kubernetes(random_number: int):
        import stackit_spark
        import pandas as pd

        spark = stackit_spark.get_spark()

        # Build a one-row Spark DataFrame from the argument and display it.
        df = spark.createDataFrame(pd.DataFrame({"random_number": [random_number]}))
        df.show()

    # ---------------------------------------------------------------------
    # 3) PYTHON SCRIPT OPERATOR
    # ---------------------------------------------------------------------
    # If you already have a Python script in your repository (instead of
    # embedding code directly in the DAG), use STACKITPythonScriptOperator.
    #
    # Features:
    # - Executes a .py file from your DAG repo
    # - Still benefits from repo sync and injected Airflow context
    # - Works with any image that has Python installed
    #
    # Note: Unlike the Spark operator, this does not provide the
    # stackit_spark helper module, unless your image includes it.
    stackit_python_kubernetes = STACKITPythonScriptOperator(
        task_id="stackit_python_kubernetes",
        script="Demo/scripts/basic_python_script.py",  # relative to repo root
    )

    # ---------------------------------------------------------------------
    # 4) SPARK SCRIPT OPERATOR
    # ---------------------------------------------------------------------
    # Use STACKITSparkScriptOperator if you have Spark jobs stored as
    # standalone scripts (in this repo or in another git repo).
    #
    # Features:
    # - Executes a .py Spark job script
    # - Supports imports inside the script (thanks to git-sync)
    # - Runs on the specified Spark image with configurable resources
    #
    # For cross-repo jobs, you can set `git_repo`, `git_ref`, etc.
    stackit_spark_kubernetes = STACKITSparkScriptOperator(
        task_id="stackit_spark_kubernetes",
        script="Demo/scripts/my_spark_job_with_imports.py",  # relative to repo root
        cpu=2,
        **default_kwargs,
    )

    # ---------------------------------------------------------------------
    # DAG FLOW
    # ---------------------------------------------------------------------
    # First run some inline Python, then a Spark function,
    # then trigger the script-based operators (both are downstream of the
    # Spark function task).
    stackit_decorated_python_kubernetes() >> stackit_decorated_spark_kubernetes(12) >> [
        stackit_python_kubernetes,
        stackit_spark_kubernetes,
    ]
# Call the @dag-decorated function at import time so the DAG object is
# actually created when this file is parsed.
stackit_operators()