Initial commit

2026-05-28 17:44:11 +02:00 · 2026-05-28 17:44:11 +02:00 · a578239c4f
commit a578239c4f
parent d4028fca11
32 changed files with 2559 additions and 0 deletions
--- a/Demo/stackit-10-notebook.py
+++ b/Demo/stackit-10-notebook.py
@@ -0,0 +1,108 @@
+import pendulum
+
+from airflow.sdk import dag, task
+from airflow.models.connection import Connection
+from stackit_workflows.airflow_plugin.operators import STACKITSparkScriptOperator
+from stackit_workflows import config
+
+# Airflow Connection "s3_papermill_output" must be configured to write notebook output to S3:
+#   host     = <S3 endpoint URL>          (used as BOTO3_ENDPOINT_URL)
+#   login    = <AWS_ACCESS_KEY_ID>
+#   password = <AWS_SECRET_ACCESS_KEY>
+try:
+    s3_connection = Connection.get_connection_from_secrets("s3_papermill_output")
+except Exception:
+    import warnings
+    warnings.warn(
+        "Airflow connection 's3_papermill_output' is not configured. "
+        "This DAG will be visible but cannot be run until the connection is created. "
+        "Please create an Airflow connection with conn_id='s3_papermill_output': "
+        "host=<S3 endpoint URL>, login=<AWS_ACCESS_KEY_ID>, password=<AWS_SECRET_ACCESS_KEY>.",
+        stacklevel=2,
+    )
+    s3_connection = None
+
+@dag(
+    schedule=None,
+    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+    catchup=False,
+    tags=["demo","stackit-demo"],
+    dag_id="stackit_10_jupyter_notebook",
+)
+def spark_full_configuration():
+
+    if s3_connection is None:
+        # S3 connection not available — define a placeholder task so the DAG
+        # can still be imported and inspected.
+        @task()
+        def setup_required():
+            raise RuntimeError(
+                "Airflow connection 's3_papermill_output' is not configured. "
+                "Please create it with: host=<S3 endpoint URL>, "
+                "login=<AWS_ACCESS_KEY_ID>, password=<AWS_SECRET_ACCESS_KEY>."
+            )
+        setup_required()
+        return
+
+    output_s3 = STACKITSparkScriptOperator(
+        task_id="papermill_s3",
+        # image : str, optional
+        #     Image of the launched container.
+        #     If not specified, a current spark image maintained by stackit is used.
+        image="schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2",
+        # get_logs : bool, optional
+        #     Read the container's stdout and use it as the task's logs. Default: True
+        get_logs=True,
+        # is_papermill : bool, optional
+        #     Flag to define if the script defined in the `script` parameter
+        #     is a jupyter notebook that is supposed to be executed using papermill.
+        #     This parameter does not need to be set in order to use papermill. If
+        #     it is not set the spark operator identifies notebooks automatically
+        #     and uses papermill.
+        is_papermill=True,
+        # script : str
+        #     Path of the script to execute. If a relative path is specified, it
+        #     is interpreted relative to the git-sync folder.
+        #     If an absolute path is specified it is not adapted further.
+        script="Demo/scripts/my_spark_notebook.ipynb",
+        # papermill_target_folder: str
+        #     If papermill is supposed to be used, some target folder needs to be defined
+        #     here in order to write the target notebook to the corresponding folder.
+        #     If this is set to "-" the output is written to stdout.
+        papermill_target_folder="s3://internal-airflow-papermill-output/notebooks",
+        # s3_arguments : dict
+        #     Arguments used to authenticate to S3. Should be used to pass
+        #     AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and BOTO3_ENDPOINT_URL,
+        #     which are required to authenticate to the external S3 that the
+        #     results are pushed to.
+        s3_arguments={
+            "AWS_ACCESS_KEY_ID": s3_connection.login,
+            "AWS_SECRET_ACCESS_KEY": s3_connection.password,
+            "BOTO3_ENDPOINT_URL": s3_connection.host,
+        },
+        # dremio_connections : list of str, optional
+        #     List of connection ids that are used to configure spark.
+        #     The "lakehouse" connection is pre-configured and connects to
+        #     the same lakehouse as Dremio.
+        dremio_connections=["lakehouse-rest"],
+    )
+
+    output_stdout = STACKITSparkScriptOperator(
+        task_id="papermill_stdout",
+        image="schwarzit-xx-sit-dp-customer-artifactory-docker-local.jfrog.io/stackit-spark:spark3.5.3-0.1.2",
+        get_logs=True,
+        is_papermill=True,
+        script="Demo/scripts/my_spark_notebook.ipynb",
+        papermill_target_folder="-",
+        s3_arguments={
+            "AWS_ACCESS_KEY_ID": s3_connection.login,
+            "AWS_SECRET_ACCESS_KEY": s3_connection.password,
+            "BOTO3_ENDPOINT_URL": s3_connection.host,
+        },
+        dremio_connections=["lakehouse-rest"],
+    )
+
+    output_stdout >> output_s3
+
+
+spark_full_configuration()