import pendulum
from airflow.sdk import dag
from stackit_workflows.airflow_plugin.operators import (
    STACKITSparkScriptOperator,
    STACKITPythonScriptOperator,
)
from stackit_workflows.airflow_plugin.decorators import stackit

# -------------------------------------------------------------------------
# DEMO DAG: How to use the STACKIT Operators
# -------------------------------------------------------------------------
# This DAG demonstrates the different ways to run workloads on Kubernetes
# using the STACKIT operator suite. It covers three main scenarios:
#
#   1. Running Python code directly with the @stackit decorator
#   2. Running Spark workloads via the @stackit Spark decorator
#   3. Running standalone Python or Spark scripts with the ScriptOperators
#
# Each operator abstracts away boilerplate (like syncing the repo,
# configuring the namespace, injecting context, mounting secrets, etc.),
# so you can focus only on your workload.
# -------------------------------------------------------------------------

# It is good practice (but not required) to specify the image you want to run on.
# By default, the STACKIT Spark image contains Spark, Java, Python, and common libs.
# These kwargs are shared by the Spark decorator and the Spark script operator below.
default_kwargs = {
    "image": "schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2"
}


# Manually triggered demo DAG (schedule=None, catchup disabled). It wires two
# decorated tasks in sequence and then fans out to two script-based operators.
@dag(
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    tags=["demo", "stackit-demo"],
    dag_id="stackit_14_stackit_operators",
)
def stackit_operators():
    # ---------------------------------------------------------------------
    # 1) PYTHON DECORATOR
    # ---------------------------------------------------------------------
    # The @stackit.python_kubernetes_task decorator is the simplest way
    # to run Python code in Kubernetes.
    #
    # Features provided out-of-the-box:
    #   - Uses a prebuilt Python image with pandas, requests, etc.
    #   - Injects Airflow context (dag_id, run_id, task_id, etc.) as ENV vars
    #   - Git-syncs your DAG repo into the pod (so imports just work)
    #   - Runs in the correct namespace with STACKIT defaults
    #
    # This is the preferred way if you want to write Python directly inside
    # your DAG file without worrying about container setup.
    @stackit.python_kubernetes_task()
    def stackit_decorated_python_kubernetes():
        import os

        # Example: import a helper module from the repo
        from scripts.my_tools.say_hello import say_hello

        say_hello()

        # Show how Airflow context is injected as environment variables
        task_id = os.environ["AIRFLOW__CONTEXT__TASK__TASK_ID"]
        print(f"Task ID: {task_id}")

        # Print all relevant injected ENV variables, sorted by name.
        # The prefixes are defined inside the task, like the imports above,
        # so the function body stays self-contained.
        injected_prefixes = ("AIRFLOW__CONTEXT", "STACKIT__")
        env_vars = {
            key: value
            for key, value in os.environ.items()
            if key.startswith(injected_prefixes)
        }
        for key, value in sorted(env_vars.items()):
            print(f"{key}: {value}")

    # ---------------------------------------------------------------------
    # 2) SPARK DECORATOR
    # ---------------------------------------------------------------------
    # The @stackit.spark_kubernetes_task decorator makes it easy to run
    # Spark jobs from Airflow, without setting up Spark-on-K8s manually.
    #
    # Features:
    #   - Starts a single-node Spark cluster by default
    #   - Includes the stackit_spark helper library for easy SparkSession setup
    #   - Allows you to scale CPU/memory via parameters
    #
    # This is the preferred way to run Spark transformations as functions.
    @stackit.spark_kubernetes_task(**default_kwargs)
    def stackit_decorated_spark_kubernetes(random_number: int):
        import stackit_spark
        import pandas as pd

        spark = stackit_spark.get_spark()

        # Build a one-row Spark DataFrame from the argument and display it.
        df = spark.createDataFrame(pd.DataFrame({"random_number": [random_number]}))
        df.show()

    # ---------------------------------------------------------------------
    # 3) PYTHON SCRIPT OPERATOR
    # ---------------------------------------------------------------------
    # If you already have a Python script in your repository (instead of
    # embedding code directly in the DAG), use STACKITPythonScriptOperator.
    #
    # Features:
    #   - Executes a .py file from your DAG repo
    #   - Still benefits from repo sync and injected Airflow context
    #   - Works with any image that has Python installed
    #
    # Note: Unlike the Spark operator, this does not provide the
    # stackit_spark helper module, unless your image includes it.
    stackit_python_kubernetes = STACKITPythonScriptOperator(
        task_id="stackit_python_kubernetes",
        script="Demo/scripts/basic_python_script.py",  # relative to repo root
    )

    # ---------------------------------------------------------------------
    # 4) SPARK SCRIPT OPERATOR
    # ---------------------------------------------------------------------
    # Use STACKITSparkScriptOperator if you have Spark jobs stored as
    # standalone scripts (in this repo or in another git repo).
    #
    # Features:
    #   - Executes a .py Spark job script
    #   - Supports imports inside the script (thanks to git-sync)
    #   - Runs on the specified Spark image with configurable resources
    #
    # For cross-repo jobs, you can set `git_repo`, `git_ref`, etc.
    stackit_spark_kubernetes = STACKITSparkScriptOperator(
        task_id="stackit_spark_kubernetes",
        script="Demo/scripts/my_spark_job_with_imports.py",  # relative to repo root
        cpu=2,
        **default_kwargs,
    )

    # ---------------------------------------------------------------------
    # DAG FLOW
    # ---------------------------------------------------------------------
    # First run some inline Python, then a Spark function,
    # then trigger the two script-based operators (both downstream of the
    # Spark function, with no dependency between each other).
    stackit_decorated_python_kubernetes() >> stackit_decorated_spark_kubernetes(12) >> [
        stackit_python_kubernetes,
        stackit_spark_kubernetes,
    ]


# Calling the decorated function at import time instantiates the DAG.
stackit_operators()