workflows-example-dags/Demo/scripts/sales_prediction.ipynb
2026-05-28 17:44:11 +02:00

182 lines
6.5 KiB
Text

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "8aae8ae7-4cb4-4b66-b325-aa3eabdeb455",
"metadata": {},
"outputs": [],
"source": [
"run_date = \"2025-05-08\"\n",
"input_sales = \"local.retail_sales\"\n",
"input_inventory = \"local.retail_inventory\"\n",
"output_table = \"local.restock_plan\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3598df66-6027-4f02-b7bd-c279b9d20c32",
"metadata": {},
"outputs": [],
"source": [
"# create spark session with authentication to the catalog\n",
"\n",
"def get_catalog_token(tokenendpoint, password):\n",
" import requests\n",
"\n",
" # Exchange Dremio PAT to Nessie token\n",
" token_request_body = {\n",
" \"grant_type\": \"urn:ietf:params:oauth:grant-type:token-exchange\",\n",
" \"scope\": \"dremio.all\",\n",
" \"subject_token_type\": \"urn:ietf:params:oauth:token-type:dremio:personal-access-token\",\n",
" \"subject_token\": password,\n",
" }\n",
" x = requests.post(tokenendpoint, data=token_request_body)\n",
" x.raise_for_status()\n",
" return x.json()[\"access_token\"]\n",
"\n",
"\n",
"def get_spark_session(host, catalog_token):\n",
" import stackit_spark\n",
"\n",
" catalog_name_in_spark = \"stackit\"\n",
" return stackit_spark.get_spark(\n",
" additional_config={\n",
" \"spark.jars.packages\": \"org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.iceberg:iceberg-aws-bundle:1.6.1\",\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}\": \"org.apache.iceberg.spark.SparkCatalog\",\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}.warehouse\": \"catalog-s3\",\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}.type\": \"rest\",\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}.uri\": host,\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}.token\": catalog_token,\n",
" }\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9071bfb-1767-47b3-907f-723ce4389d9e",
"metadata": {},
"outputs": [],
"source": [
"catalog_token = get_catalog_token(\"https://dremio-internal.data-platform.stackit.run/oauth/token\", \"JnHzzS1LRFeZw4HIJQNP+iGJereEuCehcZwyGwSxcZPSrX4H7NL6FGqOxf/lRw==\")\n",
"spark = get_spark_session(\"https://dremio-internal-catalog.data-platform.stackit.run/iceberg/main\", catalog_token)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c250ade-1565-4c95-b1ae-134724a2e77b",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(f\"USE stackit\")\n",
"# Step 1: Create a namespace\n",
"spark.sql(\"CREATE NAMESPACE IF NOT EXISTS retail_demo\")\n",
"\n",
"# Step 2: Create synthetic sales data\n",
"sales_df = spark.range(100).selectExpr(\n",
" \"date_add('2025-05-01', cast(rand() * 7 as int)) as sale_date\",\n",
" \"cast(rand() * 10 + 1 as int) as units_sold\",\n",
" \"case when cast(rand() * 3 as int) = 0 then 'A123' \"\n",
" \" when cast(rand() * 3 as int) = 1 then 'B456' \"\n",
" \" else 'C789' end as product_id\",\n",
" \"case when cast(rand() * 2 as int) = 0 then '101' else '102' end as store_id\"\n",
")\n",
"\n",
"# Step 3: Create synthetic inventory data\n",
"inventory_df = spark.range(10).selectExpr(\n",
" \"'2025-05-08' as inventory_date\",\n",
" \"cast(rand() * 20 + 10 as int) as current_stock\",\n",
" \"case when cast(rand() * 3 as int) = 0 then 'A123' \"\n",
" \" when cast(rand() * 3 as int) = 1 then 'B456' \"\n",
" \" else 'C789' end as product_id\",\n",
" \"case when cast(rand() * 2 as int) = 0 then '101' else '102' end as store_id\"\n",
")\n",
"\n",
"# Step 4: Write to Iceberg tables\n",
"sales_df.write.mode(\"overwrite\").saveAsTable(\"retail_demo.sales_data\")\n",
"inventory_df.write.mode(\"overwrite\").saveAsTable(\"retail_demo.inventory_data\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5bd53006-ddbf-4ecf-bf3b-fa173cf9893b",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import expr\n",
"\n",
"# Load sales and inventory data\n",
"sales = spark.sql(\"\"\"\n",
" SELECT store_id, product_id, sale_date, units_sold\n",
" FROM retail_demo.sales_data\n",
"\"\"\")\n",
"\n",
"inventory = spark.sql(\"\"\"\n",
" SELECT store_id, product_id, inventory_date, current_stock\n",
" FROM retail_demo.inventory_data\n",
"\"\"\")\n",
"\n",
"# Step 2: Aggregate demand per store/product (e.g. last 7 days)\n",
"demand = sales.groupBy(\"store_id\", \"product_id\") \\\n",
" .agg({\"units_sold\": \"avg\"}) \\\n",
" .withColumnRenamed(\"avg(units_sold)\", \"predicted_demand\")\n",
"\n",
"# Step 3: Join with inventory\n",
"restock_plan = demand.join(inventory, on=[\"store_id\", \"product_id\"], how=\"inner\")\n",
"\n",
"# Step 4: Calculate restock quantity\n",
"restock_plan = restock_plan.withColumn(\n",
" \"restock_qty\",\n",
" expr(\"CASE WHEN predicted_demand - current_stock > 0 THEN int(predicted_demand - current_stock) ELSE 0 END\")\n",
")\n",
"\n",
"# Step 5: Add metadata\n",
"restock_plan = restock_plan.withColumn(\"run_date\", expr(\"current_date()\"))\n",
"\n",
"# Step 6: Save to Iceberg\n",
"restock_plan.write.mode(\"overwrite\").saveAsTable(\"retail_demo.restock_plan\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5fea9096-b7b3-44d4-91aa-189ac9ea4a33",
"metadata": {},
"outputs": [],
"source": [
"# Convert the Spark DataFrame to a Pandas DataFrame for better display in Jupyter\n",
"restock_df = restock_plan.toPandas()\n",
"\n",
"# Display the restock plan nicely in Jupyter using HTML table format\n",
"import IPython.display as display\n",
"display.display(display.HTML(restock_df.to_html(index=False)))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}