Initial commit

2026-05-28 17:44:11 +02:00 · 2026-05-28 17:44:11 +02:00 · a578239c4f
commit a578239c4f
parent d4028fca11
32 changed files with 2559 additions and 0 deletions
--- a/Demo/stackit-04-simple-spark.py
+++ b/Demo/stackit-04-simple-spark.py
@ -0,0 +1,149 @@
+import pendulum
+
+from airflow.sdk import dag, task
+from stackit_workflows.airflow_plugin.decorators import stackit
+from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator
+
+# Shared kwargs for the Spark tasks below: specifying the image is good practice, but not required.
+default_kwargs = {
+    "image": "schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2"
+}
+
+
+@dag(
+    schedule=None,
+    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+    catchup=False,
+    tags=["demo","stackit-demo"],
+    dag_id="stackit_04_simple_spark",
+)
+def simple_spark():
+    # This is a regular task using the airflow taskflow API
+    @task()
+    def generate_number():
+        import random
+
+        return random.randint(1, 100)
+
+    # This is our own provider which makes it easy to run spark jobs from airflow.
+    # By default, this launches a single node spark with 1 CPU and 6 GB RAM.
+    @stackit.spark_kubernetes_task(**default_kwargs)
+    def spark_single_node(random_number: int):
+        import stackit_spark
+        import pandas as pd
+
+        spark = stackit_spark.get_spark()
+        df = spark.createDataFrame(pd.DataFrame({"random_number": [random_number]}))
+        df.show()
+
+    # If we need more resources, we can specify them with the `cpu` and `memory_gb` parameters.
+    # By default, memory is calculated as: Memory in GB = 6 * CPU.
+    # Memory can also be specified explicitly.
+    @stackit.spark_kubernetes_task(cpu=3, **default_kwargs)
+    def spark_single_node_large(random_number: int):
+        import stackit_spark
+        import pandas as pd
+        import time
+
+        spark = stackit_spark.get_spark()
+        data = pd.DataFrame({"random_number": [random_number]})
+        df = spark.createDataFrame(data)
+        df.show()
+        time.sleep(60)
+
+    # You can optionally specify the resources and also the number of executors.
+    # `cpu` and `memory_gb` set resources for the driver, `executor_cpu` and `executor_memory_gb` 
+    # set resources for the executors.
+    # If memory is not specified, it is calculated based on the specified CPUs. (1 CPU = 6 GB RAM)
+    # `executors` sets the number of executors.
+    # This uses by default a stackit provided spark image. It can be overridden.
+
+    # For many tasks one spark node is enough.
+    # However, for very large transformations, we might need to spin up a cluster.
+    #
+    # Take this step only if you need more than ~16 CPUs and 128 GB RAM.
+    # Running Spark in cluster node can sometimes even hurt performance as
+    # data needs to be shuffled between nodes - and Network IO is often the bottleneck.
+    #
+    # `cpu` and `memory_gb` can still be used to configure the driver.
+    # Executors are configured with `executor_cpu` and `executor_memory_gb`.
+    # The number of executors can be set with `executors`. Each executor
+    # gets the resources specified in `executor_cpu` and `executor_memory_gb`.
+    @stackit.spark_kubernetes_task(cpu=2, executors=3, executor_cpu=2, **default_kwargs)
+    def spark_cluster(random_number: int):
+        import stackit_spark
+        import pandas as pd
+        import time
+
+        # You can customize spark here:
+        spark = stackit_spark.get_spark(
+            additional_config={
+                "spark.sql.shuffle.partitions": 200,
+                "spark.executor.memoryOverhead": "1g",
+                # We add additional jars from maven
+                "spark.jars.packages": "org.apache.hadoop:hadoop-azure-datalake:3.3.4,org.apache.hadoop:hadoop-azure:3.3.4,",
+            }
+        )
+        data = pd.DataFrame({"random_number": [random_number]})
+        df = spark.createDataFrame(data)
+        df.show()
+
+        sc = spark._jsc.sc()
+        executors = [
+            executor.host() for executor in sc.statusTracker().getExecutorInfos()
+        ]
+        print(f"Executors: {executors}")
+
+        time.sleep(60)
+
+    # Instead of using the taskflow API, you can also use an operator and specify a script to
+    # be executed.
+    t1 = STACKITSparkScriptOperator(
+        task_id="spark_operator",
+        # You can use here all parameters that you already learned about above.
+        cpu=2,
+        # Specify a relative path here from the root of the DAGs Repository
+        script="Demo/scripts/my_spark_job.py",
+        **default_kwargs,
+    )
+
+    # When using the StackITSparkScriptOperator, the script is run in the context of
+    # the git repository it is contained in. This means you can use imports inside
+    # of scripts!
+    # Per default, the script is expected to be in the same repository as this DAG.
+    # If it is in another repository, you can specify the git_* arguments of the
+    # operator. Check the `05-spark-full-configuration.py` for an example.
+    t1 = STACKITSparkScriptOperator(
+        task_id="spark_operator_with_imports",
+        # You can use here all parameters that you already learned about above.
+        cpu=2,
+        # Specify a relative path here from the root of the DAGs Repository
+        script="Demo/scripts/my_spark_job_with_imports.py",
+        **default_kwargs,
+    )
+
+    # Even when using your own image, you can still use this operator to execute
+    # a python script in any git repository in the context of your image. You
+    # won't be able to use the `stackit_spark` module in this case.
+    # `python` needs to be installed in the image.
+    t1 = STACKITSparkScriptOperator(
+        task_id="run_script_in_custom_image",
+        # You can use here all parameters that you already learned about above.
+        # This will still set the resource requests of your pod!
+        cpu=2,
+        # Specify a relative path here from the root of the DAGs Repository
+        script="Demo/scripts/basic_python_script.py",
+        image="python:3.12-slim-bullseye",
+        # All pods must run as non-root users. We recommend to use images
+        # that already run as non-root users.
+        security_context={"runAsUser": 1000},
+    )
+
+    random_number = generate_number()
+    spark_single_node(random_number)
+    spark_single_node_large(random_number)
+    spark_cluster(random_number)
+    random_number >> t1
+
+
+simple_spark()  # Call the @dag-decorated function so the DAG object is actually created.