Initial commit
This commit is contained in:
parent
d4028fca11
commit
a578239c4f
32 changed files with 2559 additions and 0 deletions
149
Demo/stackit-04-simple-spark.py
Normal file
149
Demo/stackit-04-simple-spark.py
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
import pendulum
|
||||
|
||||
from airflow.sdk import dag, task
|
||||
from stackit_workflows.airflow_plugin.decorators import stackit
|
||||
from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator
|
||||
|
||||
# Keyword arguments shared by every STACKIT Spark task/operator in this file.
# Pinning the image explicitly is good practice, although it is not required.
default_kwargs = {
    "image": (
        "schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io"
        "/stackit-spark:spark3.5.3-0.1.2"
    ),
}
|
||||
|
||||
|
||||
@dag(
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    tags=["demo", "stackit-demo"],
    dag_id="stackit_04_simple_spark",
)
def simple_spark():
    """Demo DAG showing several ways to run Spark (and plain Python) on STACKIT.

    The DAG generates a random number with a regular TaskFlow task and passes it
    to three Spark tasks (single node, larger single node, cluster). It also
    declares three `STACKITSparkScriptOperator` tasks that execute scripts from
    a git repository.
    """

    # This is a regular task using the airflow taskflow API
    @task()
    def generate_number() -> int:
        """Return a random integer between 1 and 100 (both inclusive)."""
        import random

        return random.randint(1, 100)

    # This is our own provider which makes it easy to run spark jobs from airflow.
    # By default, this launches a single node spark with 1 CPU and 6 GB RAM.
    @stackit.spark_kubernetes_task(**default_kwargs)
    def spark_single_node(random_number: int) -> None:
        """Show `random_number` as a one-row Spark DataFrame on a single node."""
        import stackit_spark
        import pandas as pd

        spark = stackit_spark.get_spark()
        df = spark.createDataFrame(pd.DataFrame({"random_number": [random_number]}))
        df.show()

    # If we need more resources, we can specify them with the `cpu` and `memory_gb` parameters.
    # By default, memory is calculated as: Memory in GB = 6 * CPU.
    # Memory can also be specified explicitly.
    @stackit.spark_kubernetes_task(cpu=3, **default_kwargs)
    def spark_single_node_large(random_number: int) -> None:
        """Show `random_number` as a one-row Spark DataFrame using 3 CPUs."""
        import stackit_spark
        import pandas as pd
        import time

        spark = stackit_spark.get_spark()
        data = pd.DataFrame({"random_number": [random_number]})
        df = spark.createDataFrame(data)
        df.show()
        # NOTE(review): presumably keeps the pod alive long enough to inspect
        # it during the demo - confirm.
        time.sleep(60)

    # For many tasks one spark node is enough.
    # However, for very large transformations, we might need to spin up a cluster.
    #
    # Take this step only if you need more than ~16 CPUs and 128 GB RAM.
    # Running Spark in cluster mode can sometimes even hurt performance, as
    # data needs to be shuffled between nodes - and network IO is often the bottleneck.
    #
    # You can optionally specify the resources and also the number of executors:
    # - `cpu` and `memory_gb` set the resources of the driver.
    # - `executor_cpu` and `executor_memory_gb` set the resources of each executor.
    # - `executors` sets the number of executors.
    # If memory is not specified, it is calculated based on the specified CPUs
    # (1 CPU = 6 GB RAM).
    # By default, a STACKIT-provided spark image is used. It can be overridden.
    @stackit.spark_kubernetes_task(cpu=2, executors=3, executor_cpu=2, **default_kwargs)
    def spark_cluster(random_number: int) -> None:
        """Show `random_number` on a Spark cluster and print the executor hosts."""
        import stackit_spark
        import pandas as pd
        import time

        # You can customize spark here:
        spark = stackit_spark.get_spark(
            additional_config={
                "spark.sql.shuffle.partitions": 200,
                "spark.executor.memoryOverhead": "1g",
                # We add additional jars from maven (comma-separated coordinates,
                # without a trailing comma).
                "spark.jars.packages": (
                    "org.apache.hadoop:hadoop-azure-datalake:3.3.4,"
                    "org.apache.hadoop:hadoop-azure:3.3.4"
                ),
            }
        )
        data = pd.DataFrame({"random_number": [random_number]})
        df = spark.createDataFrame(data)
        df.show()

        # Ask the JVM SparkContext's status tracker for the hosts of all executors.
        sc = spark._jsc.sc()
        executor_hosts = [
            executor.host() for executor in sc.statusTracker().getExecutorInfos()
        ]
        print(f"Executors: {executor_hosts}")

        # NOTE(review): presumably keeps the cluster alive long enough to
        # inspect it during the demo - confirm.
        time.sleep(60)

    # Instead of using the taskflow API, you can also use an operator and specify a script to
    # be executed.
    spark_operator = STACKITSparkScriptOperator(
        task_id="spark_operator",
        # You can use here all parameters that you already learned about above.
        cpu=2,
        # Specify a relative path here from the root of the DAGs Repository
        script="Demo/scripts/my_spark_job.py",
        **default_kwargs,
    )

    # When using the STACKITSparkScriptOperator, the script is run in the context of
    # the git repository it is contained in. This means you can use imports inside
    # of scripts!
    # By default, the script is expected to be in the same repository as this DAG.
    # If it is in another repository, you can specify the git_* arguments of the
    # operator. Check the `05-spark-full-configuration.py` for an example.
    spark_operator_with_imports = STACKITSparkScriptOperator(
        task_id="spark_operator_with_imports",
        # You can use here all parameters that you already learned about above.
        cpu=2,
        # Specify a relative path here from the root of the DAGs Repository
        script="Demo/scripts/my_spark_job_with_imports.py",
        **default_kwargs,
    )

    # Even when using your own image, you can still use this operator to execute
    # a python script in any git repository in the context of your image. You
    # won't be able to use the `stackit_spark` module in this case.
    # `python` needs to be installed in the image.
    custom_image_operator = STACKITSparkScriptOperator(
        task_id="run_script_in_custom_image",
        # You can use here all parameters that you already learned about above.
        # This will still set the resource requests of your pod!
        cpu=2,
        # Specify a relative path here from the root of the DAGs Repository
        script="Demo/scripts/basic_python_script.py",
        image="python:3.12-slim-bullseye",
        # All pods must run as non-root users. We recommend using images
        # that already run as non-root users.
        security_context={"runAsUser": 1000},
    )

    random_number = generate_number()
    spark_single_node(random_number)
    spark_single_node_large(random_number)
    spark_cluster(random_number)
    # NOTE(review): the original rebound a single variable for all three
    # operators, so only the last one (custom image) was ever chained after
    # `generate_number`. That topology is preserved here; `spark_operator` and
    # `spark_operator_with_imports` have no upstream task. Confirm whether they
    # should be chained as well.
    random_number >> custom_image_operator


simple_spark()
|
||||
Loading…
Add table
Add a link
Reference in a new issue