# Listing metadata (from the file viewer): 141 lines, 5.9 KiB, Python
import pendulum
|
|
|
|
from airflow.sdk import dag
|
|
from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator, STACKITPythonScriptOperator
|
|
from stackit_workflows.airflow_plugin.decorators import stackit
|
|
|
|
# -------------------------------------------------------------------------
# DEMO DAG: How to use the STACKIT Operators
# -------------------------------------------------------------------------
# This DAG demonstrates the different ways to run workloads on Kubernetes
# using the STACKIT operator suite. It covers three main scenarios:
#
#   1. Running Python code directly with the @stackit.python_kubernetes_task
#      decorator
#   2. Running Spark workloads via the @stackit.spark_kubernetes_task
#      decorator
#   3. Running standalone Python or Spark scripts with the ScriptOperators
#      (STACKITPythonScriptOperator / STACKITSparkScriptOperator)
#
# Each operator abstracts away boilerplate (like syncing the repo,
# configuring the namespace, injecting context, mounting secrets, etc.),
# so you can focus only on your workload.
# -------------------------------------------------------------------------
|
|
|
|
# It is good practice (but not required) to specify the image you want to run on.
# By default, the STACKIT Spark image contains Spark, Java, Python, and common libs.
#
# These keyword arguments are unpacked into both the Spark decorator and the
# Spark script operator in this file, so the image is pinned in one place.
default_kwargs = {
    "image": "schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2",
}
|
|
|
|
|
|
@dag(
    schedule=None,  # no schedule is attached to this demo DAG
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    tags=["demo", "stackit-demo"],
    dag_id="stackit_14_stackit_operators",
)
def stackit_operators():
    # Defines two decorator-based tasks and two script-operator tasks, then
    # chains them: inline Python -> inline Spark -> both script operators.

    # ---------------------------------------------------------------------
    # 1) PYTHON DECORATOR
    # ---------------------------------------------------------------------
    # The @stackit.python_kubernetes_task decorator is the simplest way
    # to run Python code in Kubernetes.
    #
    # Features provided out-of-the-box:
    #   - Uses a prebuilt Python image with pandas, requests, etc.
    #   - Injects Airflow context (dag_id, run_id, task_id, etc.) as ENV vars
    #   - Git-syncs your DAG repo into the pod (so imports just work)
    #   - Runs in the correct namespace with STACKIT defaults
    #
    # This is the preferred way if you want to write Python directly inside
    # your DAG file without worrying about container setup.
    @stackit.python_kubernetes_task()
    def stackit_decorated_python_kubernetes():
        # Imports live inside the function body, next to the code that uses
        # them, rather than at module level.
        import os

        # Example: import a helper module from the repo
        from scripts.my_tools.say_hello import say_hello

        say_hello()

        # Show how Airflow context is injected as environment variables.
        # A missing variable raises KeyError here.
        task_id = os.environ["AIRFLOW__CONTEXT__TASK__TASK_ID"]
        print(f"Task ID: {task_id}")

        # Print all relevant injected ENV variables. str.startswith accepts
        # a tuple of prefixes, so one call covers both namespaces.
        env_vars = {
            key: value
            for key, value in os.environ.items()
            if key.startswith(("AIRFLOW__CONTEXT", "STACKIT__"))
        }
        # Sorted so the log output is stable and easy to scan.
        for key, value in sorted(env_vars.items()):
            print(f"{key}: {value}")

    # ---------------------------------------------------------------------
    # 2) SPARK DECORATOR
    # ---------------------------------------------------------------------
    # The @stackit.spark_kubernetes_task decorator makes it easy to run
    # Spark jobs from Airflow, without setting up Spark-on-K8s manually.
    #
    # Features:
    #   - Starts a single-node Spark cluster by default
    #   - Includes the stackit_spark helper library for easy SparkSession setup
    #   - Allows you to scale CPU/memory via parameters
    #
    # This is the preferred way to run Spark transformations as functions.
    @stackit.spark_kubernetes_task(**default_kwargs)
    def stackit_decorated_spark_kubernetes(random_number: int):
        import stackit_spark
        import pandas as pd

        spark = stackit_spark.get_spark()
        # Build a one-row pandas frame and hand it to Spark, then display it.
        df = spark.createDataFrame(pd.DataFrame({"random_number": [random_number]}))
        df.show()

    # ---------------------------------------------------------------------
    # 3) PYTHON SCRIPT OPERATOR
    # ---------------------------------------------------------------------
    # If you already have a Python script in your repository (instead of
    # embedding code directly in the DAG), use STACKITPythonScriptOperator.
    #
    # Features:
    #   - Executes a .py file from your DAG repo
    #   - Still benefits from repo sync and injected Airflow context
    #   - Works with any image that has Python installed
    #
    # Note: Unlike the Spark operator, this does not provide the
    # stackit_spark helper module, unless your image includes it.
    stackit_python_kubernetes = STACKITPythonScriptOperator(
        task_id="stackit_python_kubernetes",
        script="Demo/scripts/basic_python_script.py",  # relative to repo root
    )

    # ---------------------------------------------------------------------
    # 4) SPARK SCRIPT OPERATOR
    # ---------------------------------------------------------------------
    # Use STACKITSparkScriptOperator if you have Spark jobs stored as
    # standalone scripts (in this repo or in another git repo).
    #
    # Features:
    #   - Executes a .py Spark job script
    #   - Supports imports inside the script (thanks to git-sync)
    #   - Runs on the specified Spark image with configurable resources
    #
    # For cross-repo jobs, you can set `git_repo`, `git_ref`, etc.
    stackit_spark_kubernetes = STACKITSparkScriptOperator(
        task_id="stackit_spark_kubernetes",
        script="Demo/scripts/my_spark_job_with_imports.py",  # relative to repo root
        cpu=2,
        **default_kwargs,
    )

    # ---------------------------------------------------------------------
    # DAG FLOW
    # ---------------------------------------------------------------------
    # First run some inline Python, then a Spark function,
    # then trigger the script-based operators (both listed as downstream
    # of the Spark function).
    stackit_decorated_python_kubernetes() >> stackit_decorated_spark_kubernetes(12) >> [
        stackit_python_kubernetes,
        stackit_spark_kubernetes,
    ]


# Call the decorated function at module level so the DAG definition above is
# actually executed when this file is loaded.
stackit_operators()
|