workflows-example-dags/Demo/stackit-04-simple-spark.py
2026-05-28 17:44:11 +02:00

149 lines
6 KiB
Python

import pendulum
from airflow.sdk import dag, task
from stackit_workflows.airflow_plugin.decorators import stackit
from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator
# Keyword arguments shared by every Spark task/operator in this DAG.
# Pinning the image is good practice, but it is not required.
default_kwargs = {
    "image": (
        "schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io"
        "/stackit-spark:spark3.5.3-0.1.2"
    ),
}
@dag(
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    tags=["demo", "stackit-demo"],
    dag_id="stackit_04_simple_spark",
)
def simple_spark():
    """Demo DAG: run Spark workloads from Airflow with the STACKIT provider.

    The DAG generates a random number with a regular TaskFlow task and hands
    it to three Spark tasks (single node, larger single node, cluster). It
    also declares three script-based operators that run downstream of the
    number generator.
    """

    # This is a regular task using the Airflow TaskFlow API.
    @task()
    def generate_number() -> int:
        """Return a random integer between 1 and 100 (both inclusive)."""
        import random

        return random.randint(1, 100)

    # This is our own provider which makes it easy to run Spark jobs from
    # Airflow. By default, this launches a single-node Spark with 1 CPU and
    # 6 GB RAM.
    @stackit.spark_kubernetes_task(**default_kwargs)
    def spark_single_node(random_number: int) -> None:
        """Show a one-row Spark DataFrame holding ``random_number``."""
        import stackit_spark
        import pandas as pd

        spark = stackit_spark.get_spark()
        df = spark.createDataFrame(pd.DataFrame({"random_number": [random_number]}))
        df.show()

    # If we need more resources, we can specify them with the `cpu` and
    # `memory_gb` parameters.
    # By default, memory is calculated as: Memory in GB = 6 * CPU.
    # Memory can also be specified explicitly.
    @stackit.spark_kubernetes_task(cpu=3, **default_kwargs)
    def spark_single_node_large(random_number: int) -> None:
        """Show a one-row Spark DataFrame on a 3-CPU single node, then wait 60 s."""
        import stackit_spark
        import pandas as pd
        import time

        spark = stackit_spark.get_spark()
        data = pd.DataFrame({"random_number": [random_number]})
        df = spark.createDataFrame(data)
        df.show()
        # NOTE(review): presumably keeps the pod alive long enough to inspect
        # it in the demo - confirm before copying into real pipelines.
        time.sleep(60)

    # For many tasks one Spark node is enough.
    # However, for very large transformations, we might need to spin up a
    # cluster.
    #
    # Take this step only if you need more than ~16 CPUs and 128 GB RAM.
    # Running Spark in cluster mode can sometimes even hurt performance as
    # data needs to be shuffled between nodes - and network IO is often the
    # bottleneck.
    #
    # `cpu` and `memory_gb` can still be used to configure the driver.
    # Executors are configured with `executor_cpu` and `executor_memory_gb`.
    # If memory is not specified, it is calculated based on the specified
    # CPUs (1 CPU = 6 GB RAM).
    # The number of executors can be set with `executors`. Each executor
    # gets the resources specified in `executor_cpu` and `executor_memory_gb`.
    # By default a STACKIT-provided Spark image is used. It can be overridden.
    @stackit.spark_kubernetes_task(cpu=2, executors=3, executor_cpu=2, **default_kwargs)
    def spark_cluster(random_number: int) -> None:
        """Show a one-row DataFrame on a 3-executor cluster and print executor hosts."""
        import stackit_spark
        import pandas as pd
        import time

        # Additional jars pulled from Maven, as comma-separated coordinates.
        # Joining a list avoids a stray trailing comma (an empty coordinate).
        maven_packages = ",".join(
            [
                "org.apache.hadoop:hadoop-azure-datalake:3.3.4",
                "org.apache.hadoop:hadoop-azure:3.3.4",
            ]
        )

        # You can customize Spark here:
        spark = stackit_spark.get_spark(
            additional_config={
                "spark.sql.shuffle.partitions": 200,
                "spark.executor.memoryOverhead": "1g",
                "spark.jars.packages": maven_packages,
            }
        )

        data = pd.DataFrame({"random_number": [random_number]})
        df = spark.createDataFrame(data)
        df.show()

        # Reach the JVM SparkContext through the private `_jsc` handle and
        # list the host of every executor known to the status tracker.
        sc = spark._jsc.sc()
        executor_hosts = [
            executor.host() for executor in sc.statusTracker().getExecutorInfos()
        ]
        print(f"Executors: {executor_hosts}")
        # NOTE(review): presumably keeps the cluster alive long enough to
        # inspect it in the demo - confirm before copying into real pipelines.
        time.sleep(60)

    # Instead of using the TaskFlow API, you can also use an operator and
    # specify a script to be executed.
    spark_operator = STACKITSparkScriptOperator(
        task_id="spark_operator",
        # You can use here all parameters that you already learned about above.
        cpu=2,
        # Specify a relative path here from the root of the DAGs repository.
        script="Demo/scripts/my_spark_job.py",
        **default_kwargs,
    )

    # When using the STACKITSparkScriptOperator, the script is run in the
    # context of the git repository it is contained in. This means you can use
    # imports inside of scripts!
    # By default, the script is expected to be in the same repository as this
    # DAG. If it is in another repository, you can specify the git_* arguments
    # of the operator. Check `05-spark-full-configuration.py` for an example.
    spark_operator_with_imports = STACKITSparkScriptOperator(
        task_id="spark_operator_with_imports",
        # You can use here all parameters that you already learned about above.
        cpu=2,
        # Specify a relative path here from the root of the DAGs repository.
        script="Demo/scripts/my_spark_job_with_imports.py",
        **default_kwargs,
    )

    # Even when using your own image, you can still use this operator to
    # execute a Python script in any git repository in the context of your
    # image. You won't be able to use the `stackit_spark` module in this case.
    # `python` needs to be installed in the image.
    custom_image_operator = STACKITSparkScriptOperator(
        task_id="run_script_in_custom_image",
        # You can use here all parameters that you already learned about above.
        # This will still set the resource requests of your pod!
        cpu=2,
        # Specify a relative path here from the root of the DAGs repository.
        script="Demo/scripts/basic_python_script.py",
        image="python:3.12-slim-bullseye",
        # All pods must run as non-root users. We recommend using images
        # that already run as non-root users.
        security_context={"runAsUser": 1000},
    )

    # Wire the DAG: the TaskFlow Spark tasks consume the generated number.
    random_number = generate_number()
    spark_single_node(random_number)
    spark_single_node_large(random_number)
    spark_cluster(random_number)

    # Each operator has its own name so none is shadowed, and all three are
    # scheduled after `generate_number` (previously only the last one was,
    # because all three were assigned to the same variable).
    random_number >> [
        spark_operator,
        spark_operator_with_imports,
        custom_image_operator,
    ]


# Calling the decorated function instantiates the DAG at module import time.
simple_spark()