Initial commit
This commit is contained in:
parent
d4028fca11
commit
a578239c4f
32 changed files with 2559 additions and 0 deletions
108
Demo/stackit-10-notebook.py
Normal file
108
Demo/stackit-10-notebook.py
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
import pendulum
|
||||
|
||||
from airflow.sdk import dag, task
|
||||
from airflow.models.connection import Connection
|
||||
from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator
|
||||
from stackit_workflows import config
|
||||
|
||||
# Airflow Connection "s3_papermill_output" must be configured to write notebook output to S3:
|
||||
# host = <S3 endpoint URL> (used as BOTO3_ENDPOINT_URL)
|
||||
# login = <AWS_ACCESS_KEY_ID>
|
||||
# password = <AWS_SECRET_ACCESS_KEY>
|
||||
try:
|
||||
s3_connection = Connection.get_connection_from_secrets("s3_papermill_output")
|
||||
except Exception:
|
||||
import warnings
|
||||
warnings.warn(
|
||||
"Airflow connection 's3_papermill_output' is not configured. "
|
||||
"This DAG will be visible but cannot be run until the connection is created. "
|
||||
"Please create an Airflow connection with conn_id='s3_papermill_output': "
|
||||
"host=<S3 endpoint URL>, login=<AWS_ACCESS_KEY_ID>, password=<AWS_SECRET_ACCESS_KEY>.",
|
||||
stacklevel=2,
|
||||
)
|
||||
s3_connection = None
|
||||
|
||||
@dag(
|
||||
schedule=None,
|
||||
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
|
||||
catchup=False,
|
||||
tags=["demo","stackit-demo"],
|
||||
dag_id="stackit_10_jupyter_notebook",
|
||||
)
|
||||
def spark_full_configuration():
|
||||
|
||||
if s3_connection is None:
|
||||
# S3 connection not available — define a placeholder task so the DAG
|
||||
# can still be imported and inspected.
|
||||
@task()
|
||||
def setup_required():
|
||||
raise RuntimeError(
|
||||
"Airflow connection 's3_papermill_output' is not configured. "
|
||||
"Please create it with: host=<S3 endpoint URL>, "
|
||||
"login=<AWS_ACCESS_KEY_ID>, password=<AWS_SECRET_ACCESS_KEY>."
|
||||
)
|
||||
setup_required()
|
||||
return
|
||||
|
||||
output_s3 = STACKITSparkScriptOperator(
|
||||
task_id="papermill_s3",
|
||||
# image : str, optional
|
||||
# Image of the launched container.
|
||||
# If not specified, a current spark image maintained by stackit is used.
|
||||
image="schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2",
|
||||
# get_logs : bool, optional
|
||||
# Read stdout get the stdout of the container as logs of the tasks. Default: True
|
||||
get_logs=True,
|
||||
# is_papermill : bool, optional
|
||||
# Flag to define if the script defined in the `script` parameter
|
||||
# is a jupyter notebook that is supposed to be executed using papermill.
|
||||
# This parameter does not need to be set in order to use papermill. If
|
||||
# it is not set the spark operator identifies notebooks automatically
|
||||
# and uses papermill.
|
||||
is_papermill=True,
|
||||
# script : str
|
||||
# Path of the script to execute. If a relative path is specified, it
|
||||
# is interpreted relative to the git-sync folder.
|
||||
# If an absolute path is specified it is not adapted further.
|
||||
script="Demo/scripts/my_spark_notebook.ipynb",
|
||||
# papermill_target_folder: str
|
||||
# If papermill is supposed to be used, some target folder needs to be defined
|
||||
# here in order to write the target notebook to the corresponding folder.
|
||||
# If this is set to "-" the outpout is written to stdout.
|
||||
papermill_target_folder="s3://internal-airflow-papermill-output/notebooks",
|
||||
# s3_arguments : dict
|
||||
# Arguments used to authenticate to S3. Should be used to pass
|
||||
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and BOTO3_ENDPOINT_URL
|
||||
# which are required for authentication to external s3 for pushing
|
||||
# the results to.
|
||||
s3_arguments={
|
||||
"AWS_ACCESS_KEY_ID": s3_connection.login,
|
||||
"AWS_SECRET_ACCESS_KEY": s3_connection.password,
|
||||
"BOTO3_ENDPOINT_URL": s3_connection.host,
|
||||
},
|
||||
# dremio_connections : list of str, optional
|
||||
# List of connection ids that are used to configure spark.
|
||||
# The "lakehouse" connection is pre-configured and connects to
|
||||
# the same lakehouse as Dremio.
|
||||
dremio_connections=["lakehouse-rest"],
|
||||
)
|
||||
|
||||
output_stdout = STACKITSparkScriptOperator(
|
||||
task_id="papermill_stdout",
|
||||
image="schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2",
|
||||
get_logs=True,
|
||||
is_papermill=True,
|
||||
script="Demo/scripts/my_spark_notebook.ipynb",
|
||||
papermill_target_folder="-",
|
||||
s3_arguments={
|
||||
"AWS_ACCESS_KEY_ID": s3_connection.login,
|
||||
"AWS_SECRET_ACCESS_KEY": s3_connection.password,
|
||||
"BOTO3_ENDPOINT_URL": s3_connection.host,
|
||||
},
|
||||
dremio_connections=["lakehouse-rest"],
|
||||
)
|
||||
|
||||
output_stdout >> output_s3
|
||||
|
||||
|
||||
spark_full_configuration()
|
||||
Loading…
Add table
Add a link
Reference in a new issue