# workflows-example-dags/Demo/stackit-14-stackit-operators.py

import pendulum

from airflow.sdk import dag
from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator, STACKITPythonScriptOperator
from stackit_workflows.airflow_plugin.decorators import stackit

# -------------------------------------------------------------------------
# DEMO DAG: How to use the STACKIT Operators
# -------------------------------------------------------------------------
# This DAG demonstrates the different ways to run workloads on Kubernetes
# using the STACKIT operator suite. It covers four scenarios:
#
# 1. Running Python code directly with the @stackit.python_kubernetes_task decorator
# 2. Running Spark workloads with the @stackit.spark_kubernetes_task decorator
# 3. Running a standalone Python script with STACKITPythonScriptOperator
# 4. Running a standalone Spark script with STACKITSparkScriptOperator
#
# Each operator abstracts away boilerplate (like syncing the repo,
# configuring the namespace, injecting context, mounting secrets, etc.),
# so you can focus only on your workload.
# -------------------------------------------------------------------------

# Shared keyword arguments for the Spark-based tasks below.
# Pinning the image explicitly is optional but recommended. According to the
# original author, the STACKIT Spark image ships Spark, Java, Python and
# common libraries.
default_kwargs = dict(
    image="schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2",
)


@dag(
    dag_id="stackit_14_stackit_operators",
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    tags=["demo", "stackit-demo"],
)
def stackit_operators():
    # Demo DAG: two decorator-based tasks run one after the other, followed
    # by two script-based operator tasks that fan out in parallel.

    # =====================================================================
    # 1) PYTHON DECORATOR
    # =====================================================================
    # @stackit.python_kubernetes_task is the shortest path to running
    # Python code in Kubernetes: write the function inline in the DAG file
    # and let the decorator deal with the container setup.
    #
    # What you get without extra configuration (per the STACKIT plugin):
    # - a prebuilt Python image with pandas, requests, etc.
    # - the Airflow context (dag_id, run_id, task_id, ...) exposed as
    #   environment variables
    # - a git-sync of the DAG repo into the pod, so repo imports resolve
    # - execution in the correct namespace with STACKIT defaults
    @stackit.python_kubernetes_task()
    def stackit_decorated_python_kubernetes():
        import os

        # Importing a helper module that lives in the repo.
        from scripts.my_tools.say_hello import say_hello

        say_hello()

        # The Airflow context is available through environment variables.
        task_id = os.environ["AIRFLOW__CONTEXT__TASK__TASK_ID"]
        print(f"Task ID: {task_id}")

        # Dump every injected variable, ordered by name.
        # str.startswith accepts a tuple, so one call covers both prefixes.
        injected_prefixes = ("AIRFLOW__CONTEXT", "STACKIT__")
        injected = sorted(
            (key, value)
            for key, value in os.environ.items()
            if key.startswith(injected_prefixes)
        )
        for key, value in injected:
            print(f"{key}: {value}")

    # =====================================================================
    # 2) SPARK DECORATOR
    # =====================================================================
    # @stackit.spark_kubernetes_task runs a Spark job from Airflow without
    # any manual Spark-on-K8s setup. Use it when a Spark transformation is
    # best expressed as a plain function.
    #
    # What the decorator provides (per the STACKIT plugin):
    # - a single-node Spark cluster by default
    # - the stackit_spark helper library for SparkSession setup
    # - CPU/memory scaling through parameters
    @stackit.spark_kubernetes_task(**default_kwargs)
    def stackit_decorated_spark_kubernetes(random_number: int):
        import stackit_spark
        import pandas as pd

        session = stackit_spark.get_spark()
        # Build a one-row pandas frame and hand it to Spark for display.
        pandas_frame = pd.DataFrame({"random_number": [random_number]})
        session.createDataFrame(pandas_frame).show()

    # =====================================================================
    # 3) PYTHON SCRIPT OPERATOR
    # =====================================================================
    # STACKITPythonScriptOperator is for Python code that already exists as
    # a script in the repository rather than inline in the DAG.
    #
    # What the operator provides (per the STACKIT plugin):
    # - runs a .py file from the DAG repo
    # - keeps the repo sync and the injected Airflow context
    # - works with any image that has Python installed
    #
    # Note: in contrast to the Spark operator, the stackit_spark helper
    # module is only available if the chosen image includes it.
    python_script_task = STACKITPythonScriptOperator(
        script="Demo/scripts/basic_python_script.py",  # path from the repo root
        task_id="stackit_python_kubernetes",
    )

    # =====================================================================
    # 4) SPARK SCRIPT OPERATOR
    # =====================================================================
    # STACKITSparkScriptOperator is for Spark jobs kept as standalone
    # scripts, either in this repo or in a different git repo.
    #
    # What the operator provides (per the STACKIT plugin):
    # - runs a .py Spark job script
    # - supports imports inside the script (thanks to git-sync)
    # - uses the given Spark image with configurable resources
    #
    # Cross-repo jobs can be configured via `git_repo`, `git_ref`, etc.
    spark_script_task = STACKITSparkScriptOperator(
        script="Demo/scripts/my_spark_job_with_imports.py",  # path from the repo root
        task_id="stackit_spark_kubernetes",
        cpu=2,
        **default_kwargs,
    )

    # =====================================================================
    # DAG FLOW
    # =====================================================================
    # Inline Python runs first, then the inline Spark function, and finally
    # both script-based operators are triggered.
    inline_python = stackit_decorated_python_kubernetes()
    inline_spark = stackit_decorated_spark_kubernetes(12)
    inline_python >> inline_spark >> [python_script_task, spark_script_task]


# Call the @dag-decorated function so that the DAG object is actually created
# when Airflow parses this file; defining the function alone is not enough.
stackit_operators()