workflows-example-dags/Demo/scripts/sales_prediction.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8aae8ae7-4cb4-4b66-b325-aa3eabdeb455",
   "metadata": {},
   "outputs": [],
   "source": [
    "run_date = \"2025-05-08\"\n",
    "input_sales = \"local.retail_sales\"\n",
    "input_inventory = \"local.retail_inventory\"\n",
    "output_table = \"local.restock_plan\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3598df66-6027-4f02-b7bd-c279b9d20c32",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create spark session with authentication to the catalog\n",
    "\n",
    "def get_catalog_token(tokenendpoint, password):\n",
    "    import requests\n",
    "\n",
    "    # Exchange Dremio PAT to Nessie token\n",
    "    token_request_body = {\n",
    "        \"grant_type\": \"urn:ietf:params:oauth:grant-type:token-exchange\",\n",
    "        \"scope\": \"dremio.all\",\n",
    "        \"subject_token_type\": \"urn:ietf:params:oauth:token-type:dremio:personal-access-token\",\n",
    "        \"subject_token\": password,\n",
    "    }\n",
    "    x = requests.post(tokenendpoint, data=token_request_body)\n",
    "    x.raise_for_status()\n",
    "    return x.json()[\"access_token\"]\n",
    "\n",
    "\n",
    "def get_spark_session(host, catalog_token):\n",
    "    import stackit_spark\n",
    "\n",
    "    catalog_name_in_spark = \"stackit\"\n",
    "    return stackit_spark.get_spark(\n",
    "        additional_config={\n",
    "            \"spark.jars.packages\": \"org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.iceberg:iceberg-aws-bundle:1.6.1\",\n",
    "            f\"spark.sql.catalog.{catalog_name_in_spark}\": \"org.apache.iceberg.spark.SparkCatalog\",\n",
    "            f\"spark.sql.catalog.{catalog_name_in_spark}.warehouse\": \"catalog-s3\",\n",
    "            f\"spark.sql.catalog.{catalog_name_in_spark}.type\": \"rest\",\n",
    "            f\"spark.sql.catalog.{catalog_name_in_spark}.uri\": host,\n",
    "            f\"spark.sql.catalog.{catalog_name_in_spark}.token\": catalog_token,\n",
    "        }\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9071bfb-1767-47b3-907f-723ce4389d9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "catalog_token = get_catalog_token(\"https://dremio-internal.data-platform.stackit.run/oauth/token\", \"JnHzzS1LRFeZw4HIJQNP+iGJereEuCehcZwyGwSxcZPSrX4H7NL6FGqOxf/lRw==\")\n",
    "spark = get_spark_session(\"https://dremio-internal-catalog.data-platform.stackit.run/iceberg/main\", catalog_token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c250ade-1565-4c95-b1ae-134724a2e77b",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(f\"USE stackit\")\n",
    "# Step 1: Create a namespace\n",
    "spark.sql(\"CREATE NAMESPACE IF NOT EXISTS retail_demo\")\n",
    "\n",
    "# Step 2: Create synthetic sales data\n",
    "sales_df = spark.range(100).selectExpr(\n",
    "    \"date_add('2025-05-01', cast(rand() * 7 as int)) as sale_date\",\n",
    "    \"cast(rand() * 10 + 1 as int) as units_sold\",\n",
    "    \"case when cast(rand() * 3 as int) = 0 then 'A123' \"\n",
    "    \"     when cast(rand() * 3 as int) = 1 then 'B456' \"\n",
    "    \"     else 'C789' end as product_id\",\n",
    "    \"case when cast(rand() * 2 as int) = 0 then '101' else '102' end as store_id\"\n",
    ")\n",
    "\n",
    "# Step 3: Create synthetic inventory data\n",
    "inventory_df = spark.range(10).selectExpr(\n",
    "    \"'2025-05-08' as inventory_date\",\n",
    "    \"cast(rand() * 20 + 10 as int) as current_stock\",\n",
    "    \"case when cast(rand() * 3 as int) = 0 then 'A123' \"\n",
    "    \"     when cast(rand() * 3 as int) = 1 then 'B456' \"\n",
    "    \"     else 'C789' end as product_id\",\n",
    "    \"case when cast(rand() * 2 as int) = 0 then '101' else '102' end as store_id\"\n",
    ")\n",
    "\n",
    "# Step 4: Write to Iceberg tables\n",
    "sales_df.write.mode(\"overwrite\").saveAsTable(\"retail_demo.sales_data\")\n",
    "inventory_df.write.mode(\"overwrite\").saveAsTable(\"retail_demo.inventory_data\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5bd53006-ddbf-4ecf-bf3b-fa173cf9893b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import expr\n",
    "\n",
    "# Load sales and inventory data\n",
    "sales = spark.sql(\"\"\"\n",
    "    SELECT store_id, product_id, sale_date, units_sold\n",
    "    FROM retail_demo.sales_data\n",
    "\"\"\")\n",
    "\n",
    "inventory = spark.sql(\"\"\"\n",
    "    SELECT store_id, product_id, inventory_date, current_stock\n",
    "    FROM retail_demo.inventory_data\n",
    "\"\"\")\n",
    "\n",
    "# Step 2: Aggregate demand per store/product (e.g. last 7 days)\n",
    "demand = sales.groupBy(\"store_id\", \"product_id\") \\\n",
    "    .agg({\"units_sold\": \"avg\"}) \\\n",
    "    .withColumnRenamed(\"avg(units_sold)\", \"predicted_demand\")\n",
    "\n",
    "# Step 3: Join with inventory\n",
    "restock_plan = demand.join(inventory, on=[\"store_id\", \"product_id\"], how=\"inner\")\n",
    "\n",
    "# Step 4: Calculate restock quantity\n",
    "restock_plan = restock_plan.withColumn(\n",
    "    \"restock_qty\",\n",
    "    expr(\"CASE WHEN predicted_demand - current_stock > 0 THEN int(predicted_demand - current_stock) ELSE 0 END\")\n",
    ")\n",
    "\n",
    "# Step 5: Add metadata\n",
    "restock_plan = restock_plan.withColumn(\"run_date\", expr(\"current_date()\"))\n",
    "\n",
    "# Step 6: Save to Iceberg\n",
    "restock_plan.write.mode(\"overwrite\").saveAsTable(\"retail_demo.restock_plan\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fea9096-b7b3-44d4-91aa-189ac9ea4a33",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the Spark DataFrame to a Pandas DataFrame for better display in Jupyter\n",
    "restock_df = restock_plan.toPandas()\n",
    "\n",
    "# Display the restock plan nicely in Jupyter using HTML table format\n",
    "import IPython.display as display\n",
    "display.display(display.HTML(restock_df.to_html(index=False)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}