109 lines
4.7 KiB
Python
109 lines
4.7 KiB
Python
import pendulum
|
|
import os
|
|
|
|
from airflow.sdk import dag, task
|
|
from airflow.models.connection import Connection
|
|
from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator
|
|
from stackit_workflows import config
|
|
|
|
# Airflow Connection "s3_papermill_output" must be configured to write notebook output to S3:
|
|
# host = <S3 endpoint URL> (used as BOTO3_ENDPOINT_URL)
|
|
# login = <AWS_ACCESS_KEY_ID>
|
|
# password = <AWS_SECRET_ACCESS_KEY>
|
|
# Look up the S3 credentials while this module is being parsed. Start from
# "not available" and only overwrite it when the lookup succeeds; any failure is
# downgraded to a warning so that importing this file never raises because of
# the missing connection.
s3_connection = None
try:
    s3_connection = Connection.get_connection_from_secrets("s3_papermill_output")
except Exception:
    # Only needed on the failure path, hence the local import.
    import warnings

    warnings.warn(
        "Airflow connection 's3_papermill_output' is not configured. "
        "This DAG will be visible but cannot be run until the connection is created. "
        "Please create an Airflow connection with conn_id='s3_papermill_output': "
        "host=<S3 endpoint URL>, login=<AWS_ACCESS_KEY_ID>, password=<AWS_SECRET_ACCESS_KEY>.",
        stacklevel=2,
    )
|
|
|
|
# Demo DAG "stackit_13_demo_sales_usecase".
#
# Runs the notebook Demo/scripts/sales_prediction.ipynb through papermill twice:
# task "run_prediction" writes the executed notebook to stdout, and the
# downstream task "save_results_in_s3" writes it to an S3 folder.
# When the module-level `s3_connection` is None, a single placeholder task that
# always fails is registered instead.
#
# NOTE(review): this description is deliberately a comment and not a docstring;
# @dag is understood to publish a function docstring as the DAG's doc_md, which
# would change what is rendered for this DAG -- confirm before converting.
@dag(
    dag_id="stackit_13_demo_sales_usecase",
    schedule="0 7 * * *",
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    tags=["demo", "stackit-demo"],
)
def spark_full_configuration():
    if s3_connection is None:
        # No S3 connection: register a placeholder task so the DAG can still be
        # imported and inspected, but fails with a helpful message when run.
        @task()
        def setup_required():
            raise RuntimeError(
                "Airflow connection 's3_papermill_output' is not configured. "
                "Please create it with: host=<S3 endpoint URL>, "
                "login=<AWS_ACCESS_KEY_ID>, password=<AWS_SECRET_ACCESS_KEY>."
            )

        setup_required()
        return

    def make_notebook_task(task_id, papermill_target_folder):
        # Build one Spark task for the sales prediction notebook. Both tasks of
        # this DAG share every setting except the task id and the papermill
        # target folder. A fresh credentials dict and connection list are
        # created on each call, so the operators never share mutable arguments.
        return STACKITSparkScriptOperator(
            task_id=task_id,
            # image : str, optional
            #   Image of the launched container. If not specified, a current
            #   spark image maintained by stackit is used.
            image="schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2",
            # get_logs : bool, optional
            #   Expose the stdout of the container as the logs of the task.
            #   Default: True
            get_logs=True,
            # is_papermill : bool, optional
            #   Flag to define if the script given in `script` is a jupyter
            #   notebook that is supposed to be executed using papermill.
            #   It does not need to be set in order to use papermill: if it is
            #   not set, the spark operator identifies notebooks automatically
            #   and uses papermill.
            is_papermill=True,
            # script : str
            #   Path of the script to execute. A relative path is interpreted
            #   relative to the git-sync folder; an absolute path is not
            #   adapted further.
            script="Demo/scripts/sales_prediction.ipynb",
            # papermill_target_folder : str
            #   Folder the executed notebook is written to when papermill is
            #   used. If this is set to "-" the output is written to stdout.
            papermill_target_folder=papermill_target_folder,
            # s3_arguments : dict
            #   Arguments used to authenticate to S3. Carries
            #   AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and BOTO3_ENDPOINT_URL,
            #   which are required for authentication to the external S3 the
            #   results are pushed to.
            s3_arguments={
                "AWS_ACCESS_KEY_ID": s3_connection.login,
                "AWS_SECRET_ACCESS_KEY": s3_connection.password,
                "BOTO3_ENDPOINT_URL": s3_connection.host,
            },
            # dremio_connections : list of str, optional
            #   List of connection ids that are used to configure spark.
            #   The "lakehouse" connection is pre-configured and connects to
            #   the same lakehouse as Dremio.
            dremio_connections=["lakehouse-rest"],
        )

    # Creation order is kept as in the original: the S3 task first, then the
    # stdout task.
    output_s3 = make_notebook_task(
        "save_results_in_s3",
        "s3://internal-airflow-papermill-output/notebooks",
    )
    output_stdout = make_notebook_task("run_prediction", "-")

    # run_prediction runs first; save_results_in_s3 is its downstream task.
    output_stdout >> output_s3
|
|
|
|
|
|
# Call the @dag-decorated function at import time so that the DAG object is
# created when this module is parsed.
spark_full_configuration()
|