Initial commit

2026-05-28 17:44:11 +02:00 · 2026-05-28 17:44:11 +02:00 · a578239c4f
commit a578239c4f
parent d4028fca11
32 changed files with 2559 additions and 0 deletions
--- a/Demo/stackit-15-data-assets.py
+++ b/Demo/stackit-15-data-assets.py
@ -0,0 +1,141 @@
+"""
+Demo: Data Assets (formerly "Datasets") in Apache Airflow 3
+
+Data Assets let you create data-driven scheduling: one DAG produces/updates
+an asset, and another DAG is automatically triggered when that asset changes.
+
+This file defines:
+  - A "producer" DAG that updates two data assets
+  - A "consumer" DAG that is scheduled to run whenever BOTH assets are updated
+  - A "single asset consumer" DAG triggered by a single asset update
+
+Key concepts:
+  - Asset: Represents a logical data artifact (file, table, topic, etc.)
+  - schedule=<Asset(...)>: Triggers the DAG when the asset is updated
+  - outlet: Marks a task as producing/updating an asset
+
+Note on Data Assets vs. S3/external Triggers:
+  - Data Assets are for Airflow-internal coordination: DAG A produces data,
+    DAG B reacts. No polling, no cloud API calls, zero cost.
+  - S3KeySensor / external triggers are for watching files created by
+    processes OUTSIDE of Airflow (e.g., a third-party ETL tool drops a file
+    into S3). Those do involve polling and incur API costs.
+
+See: https://airflow.apache.org/docs/apache-airflow/stable/authoring-and-scheduling/datasets.html
+"""
+
+import json
+import pendulum
+
+from airflow.sdk import Asset, dag, task
+
+
+# ---------------------------------------------------------------------------
+# Data Assets
+#
+# Assets are identified by a URI. The URI is purely a LOGICAL LABEL –
+# Airflow never actually connects to, reads, or polls this URI!
+# It is only used as a key to link producer tasks to consumer DAGs.
+#
+# When a producer task (with outlets=[...]) completes successfully, the
+# Airflow scheduler internally records "this asset was updated" in its
+# metadata database. Consumer DAGs scheduled on that asset are then
+# triggered automatically – entirely internal, zero external API calls,
+# no polling, no cloud costs.
+#
+# The URI could just as well be "my-orders" or "daily-report" – using
+# URIs like "s3://..." is just a convention to document where the data
+# actually lives, but Airflow itself ignores the scheme/path.
+# ---------------------------------------------------------------------------
+orders_asset = Asset("s3://my-stackit-bucket/orders/daily.parquet")
+customers_asset = Asset("s3://my-stackit-bucket/customers/latest.csv")
+
+
+# ===========================================================================
+# Producer DAG – updates both assets
+# ===========================================================================
+@dag(
+    schedule=None,  # triggered manually or by an external system
+    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+    catchup=False,
+    tags=["demo", "data-assets"],
+    dag_id="stackit_15a_data_assets_producer",
+)
+def data_assets_producer():
+    @task(outlets=[orders_asset])
+    def produce_orders():
+        """Simulate producing order data and updating the orders asset."""
+        orders = [
+            {"order_id": 1001, "amount": 250.00, "customer": "Alice"},
+            {"order_id": 1002, "amount": 125.50, "customer": "Bob"},
+            {"order_id": 1003, "amount": 340.75, "customer": "Charlie"},
+        ]
+        # In a real scenario you would write to S3, a database, etc.
+        print(f"Produced {len(orders)} orders:")
+        print(json.dumps(orders, indent=2))
+        return orders
+
+    @task(outlets=[customers_asset])
+    def produce_customers():
+        """Simulate producing customer data and updating the customers asset."""
+        customers = [
+            {"name": "Alice", "email": "alice@example.com"},
+            {"name": "Bob", "email": "bob@example.com"},
+            {"name": "Charlie", "email": "charlie@example.com"},
+        ]
+        print(f"Produced {len(customers)} customers:")
+        print(json.dumps(customers, indent=2))
+        return customers
+
+    produce_orders() >> produce_customers()
+
+
+data_assets_producer()
+
+
+# ===========================================================================
+# Consumer DAG 1 – triggered when the orders asset is updated
+# ===========================================================================
+@dag(
+    schedule=orders_asset,  # runs whenever orders_asset is updated
+    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+    catchup=False,
+    tags=["demo", "data-assets"],
+    dag_id="stackit_15b_data_assets_consumer_orders",
+)
+def data_assets_consumer_orders():
+    @task()
+    def process_orders():
+        """React to new order data being available."""
+        print("New order data detected! Running order processing pipeline...")
+        print("Aggregating daily totals...")
+        print("Order processing complete.")
+
+    process_orders()
+
+
+data_assets_consumer_orders()
+
+
+# ===========================================================================
+# Consumer DAG 2 – triggered when BOTH assets have been updated
+# ===========================================================================
+@dag(
+    schedule=(orders_asset & customers_asset),  # requires BOTH assets to be updated
+    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+    catchup=False,
+    tags=["demo", "data-assets"],
+    dag_id="stackit_15c_data_assets_consumer_combined",
+)
+def data_assets_consumer_combined():
+    @task()
+    def join_orders_and_customers():
+        """React when both orders and customers data are refreshed."""
+        print("Both orders AND customers data updated!")
+        print("Joining datasets for the combined analytics report...")
+        print("Combined report generated successfully.")
+
+    join_orders_and_customers()
+
+
+data_assets_consumer_combined()