# workflows-example-dags/Demo/stackit-04-simple-spark.py

import pendulum

from airflow.sdk import dag, task
from stackit_workflows.airflow_plugin.decorators import stackit
from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator

# Keyword arguments shared by the Spark tasks and operators below.
# Pinning the image is optional, but it is good practice to be explicit about it.
default_kwargs = dict(
    image=(
        "schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/"
        "stackit-spark:spark3.5.3-0.1.2"
    ),
)


@dag(
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    tags=["demo", "stackit-demo"],
    dag_id="stackit_04_simple_spark",
)
def simple_spark():
    """Demo DAG showing the different ways to run Spark from Airflow.

    A regular TaskFlow task generates a random number which is handed to
    three decorated Spark tasks (single node, larger single node, cluster).
    In addition, three ``STACKITSparkScriptOperator`` tasks show how to run
    a script from a git repository instead of an inline function.
    """

    # This is a regular task using the airflow taskflow API
    @task()
    def generate_number() -> int:
        import random

        return random.randint(1, 100)

    # This is our own provider which makes it easy to run spark jobs from airflow.
    # By default, this launches a single node spark with 1 CPU and 6 GB RAM.
    @stackit.spark_kubernetes_task(**default_kwargs)
    def spark_single_node(random_number: int) -> None:
        import stackit_spark
        import pandas as pd

        spark = stackit_spark.get_spark()
        df = spark.createDataFrame(pd.DataFrame({"random_number": [random_number]}))
        df.show()

    # If we need more resources, we can specify them with the `cpu` and `memory_gb` parameters.
    # By default, memory is calculated as: Memory in GB = 6 * CPU.
    # Memory can also be specified explicitly.
    @stackit.spark_kubernetes_task(cpu=3, **default_kwargs)
    def spark_single_node_large(random_number: int) -> None:
        import stackit_spark
        import pandas as pd
        import time

        spark = stackit_spark.get_spark()
        data = pd.DataFrame({"random_number": [random_number]})
        df = spark.createDataFrame(data)
        df.show()
        # Keeps the task running for another minute.
        # NOTE(review): presumably so the pod can be inspected during the demo - confirm.
        time.sleep(60)

    # For many tasks one spark node is enough.
    # However, for very large transformations, we might need to spin up a cluster.
    #
    # Take this step only if you need more than ~16 CPUs and 128 GB RAM.
    # Running Spark in cluster mode can sometimes even hurt performance as
    # data needs to be shuffled between nodes - and Network IO is often the bottleneck.
    #
    # You can optionally specify the resources and also the number of executors:
    # - `cpu` and `memory_gb` can still be used to configure the driver.
    # - Executors are configured with `executor_cpu` and `executor_memory_gb`.
    # - The number of executors can be set with `executors`. Each executor
    #   gets the resources specified in `executor_cpu` and `executor_memory_gb`.
    # - If memory is not specified, it is calculated based on the specified CPUs.
    #   (1 CPU = 6 GB RAM)
    #
    # This uses by default a stackit provided spark image. It can be overridden.
    @stackit.spark_kubernetes_task(cpu=2, executors=3, executor_cpu=2, **default_kwargs)
    def spark_cluster(random_number: int) -> None:
        import stackit_spark
        import pandas as pd
        import time

        # You can customize spark here:
        spark = stackit_spark.get_spark(
            additional_config={
                "spark.sql.shuffle.partitions": 200,
                "spark.executor.memoryOverhead": "1g",
                # We add additional jars from maven
                "spark.jars.packages": "org.apache.hadoop:hadoop-azure-datalake:3.3.4,org.apache.hadoop:hadoop-azure:3.3.4,",
            }
        )
        data = pd.DataFrame({"random_number": [random_number]})
        df = spark.createDataFrame(data)
        df.show()

        # Print the hosts of all executors the status tracker knows about.
        sc = spark._jsc.sc()
        executors = [
            executor.host() for executor in sc.statusTracker().getExecutorInfos()
        ]
        print(f"Executors: {executors}")

        # Keeps the task running for another minute.
        # NOTE(review): presumably so the cluster can be inspected during the demo - confirm.
        time.sleep(60)

    # Instead of using the taskflow API, you can also use an operator and specify a script to
    # be executed.
    script_task = STACKITSparkScriptOperator(
        task_id="spark_operator",
        # You can use here all parameters that you already learned about above.
        cpu=2,
        # Specify a relative path here from the root of the DAGs Repository
        script="Demo/scripts/my_spark_job.py",
        **default_kwargs,
    )

    # When using the STACKITSparkScriptOperator, the script is run in the context of
    # the git repository it is contained in. This means you can use imports inside
    # of scripts!
    # Per default, the script is expected to be in the same repository as this DAG.
    # If it is in another repository, you can specify the git_* arguments of the
    # operator. Check the `05-spark-full-configuration.py` for an example.
    script_with_imports_task = STACKITSparkScriptOperator(
        task_id="spark_operator_with_imports",
        # You can use here all parameters that you already learned about above.
        cpu=2,
        # Specify a relative path here from the root of the DAGs Repository
        script="Demo/scripts/my_spark_job_with_imports.py",
        **default_kwargs,
    )

    # Even when using your own image, you can still use this operator to execute
    # a python script in any git repository in the context of your image. You
    # won't be able to use the `stackit_spark` module in this case.
    # `python` needs to be installed in the image.
    custom_image_task = STACKITSparkScriptOperator(
        task_id="run_script_in_custom_image",
        # You can use here all parameters that you already learned about above.
        # This will still set the resource requests of your pod!
        cpu=2,
        # Specify a relative path here from the root of the DAGs Repository
        script="Demo/scripts/basic_python_script.py",
        image="python:3.12-slim-bullseye",
        # All pods must run as non-root users. We recommend to use images
        # that already run as non-root users.
        security_context={"runAsUser": 1000},
    )

    random_number = generate_number()
    spark_single_node(random_number)
    spark_single_node_large(random_number)
    spark_cluster(random_number)
    # Each operator has its own name so that the dependency below covers all of
    # them. Previously every operator was bound to the same variable, so only
    # the last one was actually placed downstream of `generate_number`.
    random_number >> [script_task, script_with_imports_task, custom_image_task]


# Invoke the @dag-decorated function at module level; with the TaskFlow API
# this call is what instantiates the DAG when the file is parsed.
simple_spark()