Initial commit

2026-05-28 17:44:11 +02:00 · 2026-05-28 17:44:11 +02:00 · a578239c4f
commit a578239c4f
parent d4028fca11
32 changed files with 2559 additions and 0 deletions
--- a/Demo/scripts/basic_python_script.py
+++ b/Demo/scripts/basic_python_script.py
@ -0,0 +1,8 @@
+# Print a greeting followed by the current date and time.
+import datetime
+
+# You can import libraries baked into your own Docker image here!
+# import awesome_lib
+
+print("Hello from basic_python_script.py")
+print(f"The current date is {datetime.datetime.now()}")
--- a/Demo/scripts/my_dremio_query.py
+++ b/Demo/scripts/my_dremio_query.py
@ -0,0 +1,42 @@
+"""
+  Copyright (C) 2017-2021 Dremio Corporation
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+"""
+from dremio.arguments.parse import options_default_validator
+from dremio.flight.endpoint import DremioFlightEndpoint
+
+def execute_query(dremio_host: str, dremio_port: int, username: str, token: str, query: str):
+
+    args = {
+        'hostname': dremio_host, 
+        'port': dremio_port, 
+        'tls': True, 
+        'username': username, 
+        'token': token, 
+        'query': query
+    }
+    
+    config = options_default_validator["default"] | args
+
+    # Instantiate DremioFlightEndpoint object
+    dremio_flight_endpoint = DremioFlightEndpoint(config)
+
+    # Connect to Dremio Arrow Flight server endpoint.
+    flight_client = dremio_flight_endpoint.connect()
+
+    # Get reader
+    reader = dremio_flight_endpoint.get_reader(flight_client)
+
+    # Print out the data as a dataframe
+    print(reader.read_pandas())
--- a/Demo/scripts/my_spark_job.py
+++ b/Demo/scripts/my_spark_job.py
@ -0,0 +1,9 @@
+import stackit_spark
+import time
+import pandas as pd
+
+# Obtain a Spark session from the stackit_spark helper.
+spark = stackit_spark.get_spark()
+# Build a one-row pandas frame and convert it into a Spark DataFrame.
+data = pd.DataFrame({"number": [10]})
+df = spark.createDataFrame(data)
+# NOTE(review): the 30 s pause presumably keeps the job running long enough
+# to be observed during a demo — confirm it is still wanted.
+time.sleep(30)
+df.show()
--- a/Demo/scripts/my_spark_job_with_imports.py
+++ b/Demo/scripts/my_spark_job_with_imports.py
@ -0,0 +1,10 @@
+import stackit_spark
+import pandas as pd
+from my_tools.say_hello import say_hello
+
+# Call the helper imported from the my_tools package.
+say_hello()
+
+# Obtain a Spark session, then build and show a one-row DataFrame.
+spark = stackit_spark.get_spark()
+data = pd.DataFrame({"number": [10]})
+df = spark.createDataFrame(data)
+df.show()
--- a/Demo/scripts/my_spark_notebook.ipynb
+++ b/Demo/scripts/my_spark_notebook.ipynb
@ -0,0 +1,72 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from stackit_spark import get_spark\n",
+    "\n",
+    "from my_tools.catalog_spark import get_nessie_token\n",
+    "\n",
+    "if \"STACKIT__PAPERMILL\" in os.environ:\n",
+    "    # Create Spark Session with Iceberg Rest Credentials for Dremio Enterprise Catalog\n",
+    "    catalog_name_in_spark = \"stackit\"\n",
+    "    spark = get_spark()\n",
+    "    spark.sql(f\"USE {catalog_name_in_spark}\")\n",
+    "\n",
+    "    sdf = spark.sql(f\"SELECT * FROM DEMO.user\")\n",
+    "    sdf.show()\n",
+    "    \n",
+    "else:\n",
+    "    tokenendpoint = os.environ[\"TOKEN_ENDPOINT\"]\n",
+    "    catalogendpoint = os.environ[\"CATALOG_ENDPOINT\"]\n",
+    "    password = os.environ[\"DREMIO_PAT\"]\n",
+    "\n",
+    "\n",
+    "    nessie_token = get_nessie_token(tokenendpoint, password)\n",
+    "\n",
+    "    # Create Spark Session with Iceberg Rest Credentials for Dremio Enterprise Catalog\n",
+    "    catalog_name_in_spark = \"stackit\"\n",
+    "    spark = get_spark(\n",
+    "        additional_config={\n",
+    "            \"spark.jars.packages\": \"org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.iceberg:iceberg-aws-bundle:1.6.1\",\n",
+    "            f\"spark.sql.catalog.{catalog_name_in_spark}\": \"org.apache.iceberg.spark.SparkCatalog\",\n",
+    "            f\"spark.sql.catalog.{catalog_name_in_spark}.type\": \"rest\",\n",
+    "            f\"spark.sql.catalog.{catalog_name_in_spark}.warehouse\": \"catalog-s3\",\n",
+    "            f\"spark.sql.catalog.{catalog_name_in_spark}.uri\": catalogendpoint,\n",
+    "            f\"spark.sql.catalog.{catalog_name_in_spark}.token\": nessie_token,\n",
+    "        }\n",
+    "    )\n",
+    "\n",
+    "    spark.sql(f\"USE {catalog_name_in_spark}\")\n",
+    "\n",
+    "    sdf = spark.sql(f\"SELECT * FROM DEMO.user\")\n",
+    "    sdf.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "stackit-papermill",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/Demo/scripts/my_tools/catalog_spark.py
+++ b/Demo/scripts/my_tools/catalog_spark.py
@ -0,0 +1,35 @@
+def get_nessie_token(tokenendpoint, password):
+    import requests
+
+    # Exchange Dremio PAT to Nessie token
+    token_request_body = {
+        "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange",
+        "scope": "dremio.all",
+        "subject_token_type": "urn:ietf:params:oauth:token-type:dremio:personal-access-token",
+        "subject_token": password,
+    }
+    x = requests.post(tokenendpoint, data=token_request_body)
+    x.raise_for_status()
+    return x.json()["access_token"]
+
+
+def get_spark_session(host, nessie_token):
+    import stackit_spark
+
+    catalog_name_in_spark = "stackit"
+    return stackit_spark.get_spark(
+        additional_config={
+            "spark.jars.packages": "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.iceberg:iceberg-aws-bundle:1.6.1",
+            f"spark.sql.catalog.{catalog_name_in_spark}": "org.apache.iceberg.spark.SparkCatalog",
+            f"spark.sql.catalog.{catalog_name_in_spark}.type": "rest",
+            f"spark.sql.catalog.{catalog_name_in_spark}.uri": host,
+            f"spark.sql.catalog.{catalog_name_in_spark}.token": nessie_token,
+        }
+    )
+
+
+if __name__ == "__main__":
+    # Manual invocation: request a token from a fixed endpoint. The returned
+    # token is discarded. "xxxxx" is presumably a placeholder for a real
+    # personal access token — replace it locally, never commit a real one.
+    get_nessie_token(
+        "https://dremio-internal.data-platform-dev.stackit.run/oauth/token",
+        "xxxxx",
+    )
--- a/Demo/scripts/my_tools/read_write_data.py
+++ b/Demo/scripts/my_tools/read_write_data.py
@ -0,0 +1,65 @@
+
+def read_and_write_data():
+    import stackit_spark
+    import pandas as pd
+    import os
+    import boto3
+    import pyarrow.parquet as pq
+    from io import BytesIO
+
+        # Initialize the S3 client
+    s3 = boto3.client(
+        's3',
+        aws_access_key_id='J902D929PQ1BC1HG93ZS',
+        aws_secret_access_key='J7x29/4s6eDQ0T1UywB8xo1byWTetwXrCdvYoudH',
+        endpoint_url='https://object.storage.eu01.onstackit.cloud'
+    )
+    
+    bucket_name = 'data-tpcds'
+    folder_names = [
+        's1000/time_dim', 
+        's1000/item', 
+        's1000/date_dim',
+        's1000/customer_demographics',
+        's1000/web_sales_',
+        's1000/customer1'
+    ]
+    
+    result_dfs = {}
+    spark = stackit_spark.get_spark()
+    spark.sql("USE lakehouse")
+    spark.sql("CREATE NAMESPACE IF NOT EXISTS DEMO_USECASE_AIRFLOW")
+    
+    for folder_name in folder_names:
+        objs = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_name)['Contents']
+        
+        df_list = []
+        for obj in objs:
+            key = obj['Key']
+            try:
+                # Get object from S3
+                response = s3.get_object(Bucket=bucket_name, Key=key)
+                body = response['Body'].read()  # Read the object's bytes
+                # Read Parquet file using PyArrow
+                table = pq.read_table(BytesIO(body))
+                df_part = table.to_pandas()
+                
+                df_list.append(df_part)
+            except Exception as e:
+                print(f"Error processing file {key} in folder {folder_name}: {e}")
+
+        # Concatenate the DataFrames for the current folder
+        if df_list:
+            df = pd.concat(df_list, ignore_index=True)
+            result_dfs[folder_name] = df
+        else:
+            result_dfs[folder_name] = None
+        
+    for df in result_dfs:
+        table_name=f"DEMO_USECASE_AIRFLOW.{df.replace('/', '_')}"
+        spark_df = spark.createDataFrame(result_dfs[df])
+        # Save Spark DataFrame as an Iceberg table
+        spark_df.write.mode("overwrite").saveAsTable(table_name)
+       
+
+read_and_write_data()
--- a/Demo/scripts/my_tools/read_write_table.py
+++ b/Demo/scripts/my_tools/read_write_table.py
@ -0,0 +1,65 @@
+
+def read_and_write_data():
+    import stackit_spark
+    import pandas as pd
+    import os
+    import boto3
+    import pyarrow.parquet as pq
+    from io import BytesIO
+
+        # Initialize the S3 client
+    s3 = boto3.client(
+        's3',
+        aws_access_key_id='J902D929PQ1BC1HG93ZS',
+        aws_secret_access_key='J7x29/4s6eDQ0T1UywB8xo1byWTetwXrCdvYoudH',
+        endpoint_url='https://object.storage.eu01.onstackit.cloud'
+    )
+    
+    bucket_name = 'data-tpcds'
+    folder_name = 's1000/customer_address' 
+    objs = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_name)['Contents']
+    objs = objs[:1]
+       
+    df_list = []
+    for obj in objs:
+        key = obj['Key']
+        # Get object from S3
+        response = s3.get_object(Bucket=bucket_name, Key=key)
+        body = response['Body'].read()  # Read the object's bytes
+        table = pq.read_table(BytesIO(body))
+        df_part = table.to_pandas()
+
+        df_list.append(df_part)
+
+    df = pd.concat(df_list, ignore_index=True)
+        
+
+    spark = stackit_spark.get_spark()
+    spark.sql("USE lakehouse")
+
+    spark.sql("CREATE NAMESPACE IF NOT EXISTS DEMO_USECASE_AIRFLOW")
+    spark_df = spark.createDataFrame(df)
+    # Save Spark DataFrame as an Iceberg table
+    spark_df.write.mode("overwrite").saveAsTable("DEMO_USECASE_AIRFLOW.customer_address")
+   
+
+
+
+if __name__ == "__main__":
+    #from dotenv import load_dotenv
+    import os
+    #load_dotenv()
+    #for key, value in os.environ.items():
+    #    print(f'{key}: {value}')
+
+    
+    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__S3__ACCESS-KEY-ID'] = 'DJ6STN2PRVEH6XIMP56V'
+    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__CATALOG-IMPL'] = 'org.apache.iceberg.nessie.NessieCatalog'
+    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__S3__SECRET-ACCESS-KEY'] = 'nG/AheODBfMcEhZL/cR+BQrQmO79Hia7nqweMu+n'
+    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__IO-IMPL'] = 'org.apache.iceberg.aws.s3.S3FileIO'
+    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE'] = 'org.apache.iceberg.spark.SparkCatalog'
+    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__URI'] = 'http://nessie-internal.nessie-ns.svc.cluster.local:19120/api/v1'
+    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__S3__ENDPOINT'] = 'https://object.storage.eu01.onstackit.cloud'
+    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__WAREHOUSE'] = 's3://data-platform-playground-internal/warehouse'
+
+read_and_write_data()
--- a/Demo/scripts/my_tools/say_hello.py
+++ b/Demo/scripts/my_tools/say_hello.py
@ -0,0 +1,2 @@
+def say_hello():
+    """Print a fixed greeting to standard output."""
+    print("Hello from say_hello()")
--- a/Demo/scripts/sales_prediction.ipynb
+++ b/Demo/scripts/sales_prediction.ipynb
@ -0,0 +1,182 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8aae8ae7-4cb4-4b66-b325-aa3eabdeb455",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "run_date = \"2025-05-08\"\n",
+    "input_sales = \"local.retail_sales\"\n",
+    "input_inventory = \"local.retail_inventory\"\n",
+    "output_table = \"local.restock_plan\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3598df66-6027-4f02-b7bd-c279b9d20c32",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create spark session with authentication to the catalog\n",
+    "\n",
+    "def get_catalog_token(tokenendpoint, password):\n",
+    "    import requests\n",
+    "\n",
+    "    # Exchange Dremio PAT to Nessie token\n",
+    "    token_request_body = {\n",
+    "        \"grant_type\": \"urn:ietf:params:oauth:grant-type:token-exchange\",\n",
+    "        \"scope\": \"dremio.all\",\n",
+    "        \"subject_token_type\": \"urn:ietf:params:oauth:token-type:dremio:personal-access-token\",\n",
+    "        \"subject_token\": password,\n",
+    "    }\n",
+    "    x = requests.post(tokenendpoint, data=token_request_body)\n",
+    "    x.raise_for_status()\n",
+    "    return x.json()[\"access_token\"]\n",
+    "\n",
+    "\n",
+    "def get_spark_session(host, catalog_token):\n",
+    "    import stackit_spark\n",
+    "\n",
+    "    catalog_name_in_spark = \"stackit\"\n",
+    "    return stackit_spark.get_spark(\n",
+    "        additional_config={\n",
+    "            \"spark.jars.packages\": \"org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.iceberg:iceberg-aws-bundle:1.6.1\",\n",
+    "            f\"spark.sql.catalog.{catalog_name_in_spark}\": \"org.apache.iceberg.spark.SparkCatalog\",\n",
+    "            f\"spark.sql.catalog.{catalog_name_in_spark}.warehouse\": \"catalog-s3\",\n",
+    "            f\"spark.sql.catalog.{catalog_name_in_spark}.type\": \"rest\",\n",
+    "            f\"spark.sql.catalog.{catalog_name_in_spark}.uri\": host,\n",
+    "            f\"spark.sql.catalog.{catalog_name_in_spark}.token\": catalog_token,\n",
+    "        }\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f9071bfb-1767-47b3-907f-723ce4389d9e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# The Dremio personal access token is a secret: read it from the environment\n",
+    "# (DREMIO_PAT) instead of embedding it in the notebook.\n",
+    "catalog_token = get_catalog_token(\"https://dremio-internal.data-platform.stackit.run/oauth/token\", os.environ[\"DREMIO_PAT\"])\n",
+    "spark = get_spark_session(\"https://dremio-internal-catalog.data-platform.stackit.run/iceberg/main\", catalog_token)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8c250ade-1565-4c95-b1ae-134724a2e77b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark.sql(f\"USE stackit\")\n",
+    "# Step 1: Create a namespace\n",
+    "spark.sql(\"CREATE NAMESPACE IF NOT EXISTS retail_demo\")\n",
+    "\n",
+    "# Step 2: Create synthetic sales data\n",
+    "sales_df = spark.range(100).selectExpr(\n",
+    "    \"date_add('2025-05-01', cast(rand() * 7 as int)) as sale_date\",\n",
+    "    \"cast(rand() * 10 + 1 as int) as units_sold\",\n",
+    "    \"case when cast(rand() * 3 as int) = 0 then 'A123' \"\n",
+    "    \"     when cast(rand() * 3 as int) = 1 then 'B456' \"\n",
+    "    \"     else 'C789' end as product_id\",\n",
+    "    \"case when cast(rand() * 2 as int) = 0 then '101' else '102' end as store_id\"\n",
+    ")\n",
+    "\n",
+    "# Step 3: Create synthetic inventory data\n",
+    "inventory_df = spark.range(10).selectExpr(\n",
+    "    \"'2025-05-08' as inventory_date\",\n",
+    "    \"cast(rand() * 20 + 10 as int) as current_stock\",\n",
+    "    \"case when cast(rand() * 3 as int) = 0 then 'A123' \"\n",
+    "    \"     when cast(rand() * 3 as int) = 1 then 'B456' \"\n",
+    "    \"     else 'C789' end as product_id\",\n",
+    "    \"case when cast(rand() * 2 as int) = 0 then '101' else '102' end as store_id\"\n",
+    ")\n",
+    "\n",
+    "# Step 4: Write to Iceberg tables\n",
+    "sales_df.write.mode(\"overwrite\").saveAsTable(\"retail_demo.sales_data\")\n",
+    "inventory_df.write.mode(\"overwrite\").saveAsTable(\"retail_demo.inventory_data\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5bd53006-ddbf-4ecf-bf3b-fa173cf9893b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.sql.functions import expr\n",
+    "\n",
+    "# Load sales and inventory data\n",
+    "sales = spark.sql(\"\"\"\n",
+    "    SELECT store_id, product_id, sale_date, units_sold\n",
+    "    FROM retail_demo.sales_data\n",
+    "\"\"\")\n",
+    "\n",
+    "inventory = spark.sql(\"\"\"\n",
+    "    SELECT store_id, product_id, inventory_date, current_stock\n",
+    "    FROM retail_demo.inventory_data\n",
+    "\"\"\")\n",
+    "\n",
+    "# Step 2: Aggregate demand per store/product (e.g. last 7 days)\n",
+    "demand = sales.groupBy(\"store_id\", \"product_id\") \\\n",
+    "    .agg({\"units_sold\": \"avg\"}) \\\n",
+    "    .withColumnRenamed(\"avg(units_sold)\", \"predicted_demand\")\n",
+    "\n",
+    "# Step 3: Join with inventory\n",
+    "restock_plan = demand.join(inventory, on=[\"store_id\", \"product_id\"], how=\"inner\")\n",
+    "\n",
+    "# Step 4: Calculate restock quantity\n",
+    "restock_plan = restock_plan.withColumn(\n",
+    "    \"restock_qty\",\n",
+    "    expr(\"CASE WHEN predicted_demand - current_stock > 0 THEN int(predicted_demand - current_stock) ELSE 0 END\")\n",
+    ")\n",
+    "\n",
+    "# Step 5: Add metadata\n",
+    "restock_plan = restock_plan.withColumn(\"run_date\", expr(\"current_date()\"))\n",
+    "\n",
+    "# Step 6: Save to Iceberg\n",
+    "restock_plan.write.mode(\"overwrite\").saveAsTable(\"retail_demo.restock_plan\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5fea9096-b7b3-44d4-91aa-189ac9ea4a33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert the Spark DataFrame to a Pandas DataFrame for better display in Jupyter\n",
+    "restock_df = restock_plan.toPandas()\n",
+    "\n",
+    "# Display the restock plan nicely in Jupyter using HTML table format\n",
+    "import IPython.display as display\n",
+    "display.display(display.HTML(restock_df.to_html(index=False)))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}