"""Demo DAG showing the different ways to run Spark jobs with the STACKIT provider.

The DAG generates a random number with a regular TaskFlow task and passes it to
Spark tasks of increasing size (single node, larger single node, cluster).
It additionally shows how to run standalone scripts with the
``STACKITSparkScriptOperator``.
"""

import pendulum
from airflow.sdk import dag, task
from stackit_workflows.airflow_plugin.decorators import stackit
from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator

# It is good practice but not required to specify the image to use.
default_kwargs = {
    "image": "schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2"
}


# Manually triggered demo DAG (no schedule, no catchup).
@dag(
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    tags=["demo", "stackit-demo"],
    dag_id="stackit_04_simple_spark",
)
def simple_spark():
    # This is a regular task using the Airflow TaskFlow API.
    @task()
    def generate_number() -> int:
        import random

        return random.randint(1, 100)

    # This is our own provider which makes it easy to run Spark jobs from Airflow.
    # By default, this launches a single-node Spark with 1 CPU and 6 GB RAM.
    @stackit.spark_kubernetes_task(**default_kwargs)
    def spark_single_node(random_number: int) -> None:
        import stackit_spark
        import pandas as pd

        spark = stackit_spark.get_spark()
        df = spark.createDataFrame(pd.DataFrame({"random_number": [random_number]}))
        df.show()

    # If we need more resources, we can specify them with the `cpu` and
    # `memory_gb` parameters.
    # By default, memory is calculated as: Memory in GB = 6 * CPU.
    # Memory can also be specified explicitly.
    @stackit.spark_kubernetes_task(cpu=3, **default_kwargs)
    def spark_single_node_large(random_number: int) -> None:
        import stackit_spark
        import pandas as pd
        import time

        spark = stackit_spark.get_spark()
        data = pd.DataFrame({"random_number": [random_number]})
        df = spark.createDataFrame(data)
        df.show()
        time.sleep(60)

    # You can optionally specify the resources and also the number of executors.
    # `cpu` and `memory_gb` set resources for the driver, `executor_cpu` and
    # `executor_memory_gb` set resources for the executors.
    # If memory is not specified, it is calculated based on the specified CPUs
    # (1 CPU = 6 GB RAM).
    # `executors` sets the number of executors.
    # By default, this uses a STACKIT-provided Spark image. It can be overridden.
    #
    # For many tasks one Spark node is enough.
    # However, for very large transformations, we might need to spin up a cluster.
    #
    # Take this step only if you need more than ~16 CPUs and 128 GB RAM.
    # Running Spark in cluster mode can sometimes even hurt performance as
    # data needs to be shuffled between nodes - and network IO is often the
    # bottleneck.
    #
    # `cpu` and `memory_gb` can still be used to configure the driver.
    # Executors are configured with `executor_cpu` and `executor_memory_gb`.
    # The number of executors can be set with `executors`. Each executor
    # gets the resources specified in `executor_cpu` and `executor_memory_gb`.
    @stackit.spark_kubernetes_task(cpu=2, executors=3, executor_cpu=2, **default_kwargs)
    def spark_cluster(random_number: int) -> None:
        import stackit_spark
        import pandas as pd
        import time

        # You can customize Spark here:
        spark = stackit_spark.get_spark(
            additional_config={
                "spark.sql.shuffle.partitions": 200,
                "spark.executor.memoryOverhead": "1g",
                # We add additional jars from Maven.
                "spark.jars.packages": "org.apache.hadoop:hadoop-azure-datalake:3.3.4,org.apache.hadoop:hadoop-azure:3.3.4,",
            }
        )
        data = pd.DataFrame({"random_number": [random_number]})
        df = spark.createDataFrame(data)
        df.show()

        # Print the hosts of all executors known to the status tracker, then
        # keep the job alive for a minute.
        sc = spark._jsc.sc()
        executors = [
            executor.host() for executor in sc.statusTracker().getExecutorInfos()
        ]
        print(f"Executors: {executors}")
        time.sleep(60)

    # Instead of using the TaskFlow API, you can also use an operator and
    # specify a script to be executed.
    spark_operator = STACKITSparkScriptOperator(
        task_id="spark_operator",
        # You can use here all parameters that you already learned about above.
        cpu=2,
        # Specify a relative path here from the root of the DAGs repository.
        script="Demo/scripts/my_spark_job.py",
        **default_kwargs,
    )

    # When using the STACKITSparkScriptOperator, the script is run in the
    # context of the git repository it is contained in. This means you can use
    # imports inside of scripts!
    # By default, the script is expected to be in the same repository as this
    # DAG. If it is in another repository, you can specify the git_* arguments
    # of the operator. Check the `05-spark-full-configuration.py` for an example.
    spark_operator_with_imports = STACKITSparkScriptOperator(
        task_id="spark_operator_with_imports",
        # You can use here all parameters that you already learned about above.
        cpu=2,
        # Specify a relative path here from the root of the DAGs repository.
        script="Demo/scripts/my_spark_job_with_imports.py",
        **default_kwargs,
    )

    # Even when using your own image, you can still use this operator to execute
    # a Python script in any git repository in the context of your image. You
    # won't be able to use the `stackit_spark` module in this case.
    # `python` needs to be installed in the image.
    run_script_in_custom_image = STACKITSparkScriptOperator(
        task_id="run_script_in_custom_image",
        # You can use here all parameters that you already learned about above.
        # This will still set the resource requests of your pod!
        cpu=2,
        # Specify a relative path here from the root of the DAGs repository.
        script="Demo/scripts/basic_python_script.py",
        image="python:3.12-slim-bullseye",
        # All pods must run as non-root users. We recommend using images
        # that already run as non-root users.
        security_context={"runAsUser": 1000},
    )

    random_number = generate_number()

    # The TaskFlow tasks depend on `generate_number` through its return value.
    spark_single_node(random_number)
    spark_single_node_large(random_number)
    spark_cluster(random_number)

    # The script operators do not consume the value, so the ordering has to be
    # declared explicitly. Each operator is bound to its own name so that all
    # three are wired downstream of `generate_number` (re-using one variable
    # name would only wire the operator assigned last).
    random_number >> [
        spark_operator,
        spark_operator_with_imports,
        run_script_in_custom_image,
    ]


simple_spark()