workflows-example-dags/Demo/stackit-12-schedule-on-gpu-node.py
2026-05-28 17:44:11 +02:00

80 lines
2.8 KiB
Python

import json
import pendulum
from airflow.sdk import dag, task
from airflow.providers.cncf.kubernetes.operators.pod import (
KubernetesPodOperator,
)
from stackit_workflows.kubernetes import POD_NAMESPACE, K8S
from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator
from stackit_workflows.models import GpuConfig, MLFramework
# DAG "stackit_12_on_gpu_node": a demo without a schedule (schedule=None)
# that defines two tasks with no dependency between them:
#   * run_task_on_gpu - a KubernetesPodOperator steered onto GPU nodes via an
#     explicit node affinity plus a matching toleration.
#   * gpu_flag        - a STACKITSparkScriptOperator that receives a GpuConfig
#     (PyTorch framework) through its `gpu` argument.
@dag(
    dag_id="stackit_12_on_gpu_node",
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    tags=["demo", "stackit-demo"],
)
def kubernetes_operator():
    # Node affinity: in this example the target node carries the label
    # dedicated=gpu, and the pod is REQUIRED to be scheduled there. A more
    # relaxed variant (preferred_during_scheduling_ignored_during_execution)
    # would merely PREFER such nodes.
    gpu_label_requirement = K8S.V1NodeSelectorRequirement(
        key="dedicated",
        operator="In",
        values=["gpu"],
    )
    gpu_node_selector = K8S.V1NodeSelector(
        node_selector_terms=[
            K8S.V1NodeSelectorTerm(match_expressions=[gpu_label_requirement]),
        ],
    )
    gpu_affinity = K8S.V1Affinity(
        node_affinity=K8S.V1NodeAffinity(
            required_during_scheduling_ignored_during_execution=gpu_node_selector,
        ),
    )

    # Tolerations: tainted nodes only accept pods that tolerate the taint.
    # Here the GPU node is tainted dedicated=gpu:NoSchedule; further
    # tolerations can be added to this list for other dedicated node pools.
    gpu_tolerations = [
        K8S.V1Toleration(
            key="dedicated",
            operator="Equal",
            value="gpu",
            effect="NoSchedule",
        ),
    ]

    # Every launched pod has to run as a non-root user, otherwise it will not
    # start.
    non_root_context = K8S.V1PodSecurityContext(run_as_user=100)

    gpu_task = KubernetesPodOperator(
        task_id="run_task_on_gpu",
        name="gpu-task",
        # Placeholder image: substitute a Spark image with GPU support for
        # real GPU workloads.
        image="debian",
        cmds=["echo"],
        arguments=["This task is running in a gpu node"],
        affinity=gpu_affinity,
        tolerations=gpu_tolerations,
        security_context=non_root_context,
    )

    gpu_flag_test = STACKITSparkScriptOperator(
        task_id="gpu_flag",
        script="Demo/scripts/basic_python_script.py",
        image="python:3.12-slim-bullseye",
        gpu=GpuConfig(ml_framework=MLFramework.PYTORCH),
        security_context={"runAsUser": 1000},
    )

    # The tasks are only referenced here; no ordering (>>) is declared
    # between them.
    gpu_task
    gpu_flag_test
# Call the @dag-decorated function at module level so the DAG object is built
# when this file is imported.
kubernetes_operator()