Initial commit

This commit is contained in:
Tim_Doernemann 2026-05-28 17:44:11 +02:00
parent d4028fca11
commit a578239c4f
32 changed files with 2559 additions and 0 deletions

View file

@ -0,0 +1,8 @@
# Minimal example script: print a greeting followed by the current date and time.
import datetime
# You can import libraries baked into your own Docker image here, e.g.:
# import awesome_lib
print("Hello from basic_python_script.py")
# datetime.now() is called without a tz argument, so the printed value is a
# naive timestamp in the local time of the machine running the script.
print(f"The current date is {datetime.datetime.now()}")

View file

@ -0,0 +1,42 @@
"""
Copyright (C) 2017-2021 Dremio Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from dremio.arguments.parse import options_default_validator
from dremio.flight.endpoint import DremioFlightEndpoint
def execute_query(dremio_host: str, dremio_port: int, username: str, token: str, query: str):
    """Run ``query`` over a Dremio Arrow Flight connection and print the result.

    The connection settings given here are layered on top of
    ``options_default_validator["default"]`` (values passed in win), and TLS
    is always switched on.

    Args:
        dremio_host: Value stored under the ``hostname`` option.
        dremio_port: Value stored under the ``port`` option.
        username: Value stored under the ``username`` option.
        token: Value stored under the ``token`` option.
        query: Value stored under the ``query`` option.

    Returns:
        None. The result of ``reader.read_pandas()`` is printed, not returned.
    """
    overrides = dict(
        hostname=dremio_host,
        port=dremio_port,
        tls=True,
        username=username,
        token=token,
        query=query,
    )
    # Right-hand operand wins on key clashes, so the overrides replace defaults.
    settings = options_default_validator["default"] | overrides

    endpoint = DremioFlightEndpoint(settings)
    client = endpoint.connect()
    result_reader = endpoint.get_reader(client)

    # Materialise the whole result and dump it to stdout.
    print(result_reader.read_pandas())

View file

@ -0,0 +1,9 @@
import stackit_spark
import time
import pandas as pd
# Obtain a Spark session from the stackit_spark helper.
spark = stackit_spark.get_spark()
# Build a one-row, one-column pandas frame and hand it to Spark.
data = pd.DataFrame({"number": [10]})
df = spark.createDataFrame(data)
# NOTE(review): the reason for this 30 s pause is not visible in this script —
# presumably it keeps the job alive long enough to be observed; confirm before
# removing or shortening it.
time.sleep(30)
# Print the DataFrame contents to stdout.
df.show()

View file

@ -0,0 +1,10 @@
import stackit_spark
import pandas as pd
from my_tools.say_hello import say_hello
# Call the helper imported from the my_tools package before any Spark work.
say_hello()
# Obtain a Spark session from the stackit_spark helper.
spark = stackit_spark.get_spark()
# Build a one-row, one-column pandas frame and hand it to Spark.
data = pd.DataFrame({"number": [10]})
df = spark.createDataFrame(data)
# Print the DataFrame contents to stdout.
df.show()

View file

@ -0,0 +1,72 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from stackit_spark import get_spark\n",
"\n",
"from my_tools.catalog_spark import get_nessie_token\n",
"\n",
"if \"STACKIT__PAPERMILL\" in os.environ:\n",
" # Create Spark Session with Iceberg Rest Credentials for Dremio Enterprise Catalog\n",
" catalog_name_in_spark = \"stackit\"\n",
" spark = get_spark()\n",
" spark.sql(f\"USE {catalog_name_in_spark}\")\n",
"\n",
" sdf = spark.sql(f\"SELECT * FROM DEMO.user\")\n",
" sdf.show()\n",
" \n",
"else:\n",
" tokenendpoint = os.environ[\"TOKEN_ENDPOINT\"]\n",
" catalogendpoint = os.environ[\"CATALOG_ENDPOINT\"]\n",
" password = os.environ[\"DREMIO_PAT\"]\n",
"\n",
"\n",
" nessie_token = get_nessie_token(tokenendpoint, password)\n",
"\n",
" # Create Spark Session with Iceberg Rest Credentials for Dremio Enterprise Catalog\n",
" catalog_name_in_spark = \"stackit\"\n",
" spark = get_spark(\n",
" additional_config={\n",
" \"spark.jars.packages\": \"org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.iceberg:iceberg-aws-bundle:1.6.1\",\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}\": \"org.apache.iceberg.spark.SparkCatalog\",\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}.type\": \"rest\",\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}.warehouse\": \"catalog-s3\",\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}.uri\": catalogendpoint,\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}.token\": nessie_token,\n",
" }\n",
" )\n",
"\n",
" spark.sql(f\"USE {catalog_name_in_spark}\")\n",
"\n",
" sdf = spark.sql(f\"SELECT * FROM DEMO.user\")\n",
" sdf.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "stackit-papermill",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,35 @@
def get_nessie_token(tokenendpoint: str, password: str, timeout: float = 30.0) -> str:
    """Exchange a Dremio personal access token (PAT) for a Nessie token.

    Sends an OAuth token-exchange request as form data to ``tokenendpoint``.

    Args:
        tokenendpoint: URL the token-exchange request is POSTed to.
        password: The Dremio PAT; sent as the ``subject_token`` field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which can hang a
            scheduled job forever on a stalled connection.

    Returns:
        The ``access_token`` field of the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        KeyError: If the response JSON has no ``access_token`` field.
    """
    import requests

    # Exchange Dremio PAT to Nessie token
    token_request_body = {
        "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange",
        "scope": "dremio.all",
        "subject_token_type": "urn:ietf:params:oauth:token-type:dremio:personal-access-token",
        "subject_token": password,
    }
    response = requests.post(tokenendpoint, data=token_request_body, timeout=timeout)
    response.raise_for_status()
    return response.json()["access_token"]
def get_spark_session(host, nessie_token):
    """Return a Spark session configured with an Iceberg REST catalog.

    The catalog is registered in Spark under the name ``stackit`` and the
    session is obtained from ``stackit_spark.get_spark``.

    Args:
        host: Stored as the catalog's ``uri`` setting.
        nessie_token: Stored as the catalog's ``token`` setting.

    Returns:
        Whatever ``stackit_spark.get_spark`` returns for this configuration.
    """
    import stackit_spark

    catalog_name_in_spark = "stackit"
    catalog_prefix = f"spark.sql.catalog.{catalog_name_in_spark}"
    iceberg_packages = ",".join(
        (
            "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1",
            "org.apache.iceberg:iceberg-aws-bundle:1.6.1",
        )
    )

    # NOTE(review): no "<prefix>.warehouse" entry is set here — confirm the
    # REST catalog in use does not require one.
    catalog_settings = {
        "spark.jars.packages": iceberg_packages,
        catalog_prefix: "org.apache.iceberg.spark.SparkCatalog",
        f"{catalog_prefix}.type": "rest",
        f"{catalog_prefix}.uri": host,
        f"{catalog_prefix}.token": nessie_token,
    }
    return stackit_spark.get_spark(additional_config=catalog_settings)
if __name__ == "__main__":
    # Manual smoke call: request a token using a placeholder PAT. The returned
    # token is discarded; only an error (if any) is visible to the caller.
    token_url = "https://dremio-internal.data-platform-dev.stackit.run/oauth/token"
    placeholder_pat = "xxxxx"
    get_nessie_token(token_url, placeholder_pat)

View file

@ -0,0 +1,65 @@
def _load_parquet_prefix(s3, bucket_name, prefix):
    """Read every object under ``prefix`` as Parquet into one pandas DataFrame.

    Objects that cannot be fetched or parsed are reported on stdout and
    skipped (best effort). Returns ``None`` when nothing could be read.
    """
    import pandas as pd
    import pyarrow.parquet as pq
    from io import BytesIO

    frames = []
    # A single list_objects_v2 call returns at most one page of keys, so walk
    # all pages instead of reading only the first response.
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # "Contents" is absent from the response when nothing matches.
        for obj in page.get("Contents", []):
            key = obj["Key"]
            try:
                body = s3.get_object(Bucket=bucket_name, Key=key)["Body"].read()
                frames.append(pq.read_table(BytesIO(body)).to_pandas())
            except Exception as e:
                print(f"Error processing file {key} in folder {prefix}: {e}")

    if not frames:
        return None
    return pd.concat(frames, ignore_index=True)


def read_and_write_data():
    """Copy a fixed set of Parquet folders from S3 into Iceberg tables.

    For each folder prefix in the ``data-tpcds`` bucket, all readable Parquet
    objects are concatenated and written (mode ``overwrite``) to a table in
    the ``DEMO_USECASE_AIRFLOW`` namespace of the ``lakehouse`` catalog. The
    table name is the prefix with ``/`` replaced by ``_``. Folders that yield
    no data are reported and skipped instead of aborting the remaining tables.

    No credentials are passed to boto3 explicitly: they are resolved through
    boto3's default credential chain (for example the ``AWS_ACCESS_KEY_ID``
    and ``AWS_SECRET_ACCESS_KEY`` environment variables).

    SECURITY: an access key pair used to be hard-coded in this function and
    was committed to version control; that key pair must be treated as
    compromised and rotated.
    """
    import stackit_spark
    import boto3

    # Initialize the S3 client (credentials come from the environment).
    s3 = boto3.client(
        's3',
        endpoint_url='https://object.storage.eu01.onstackit.cloud',
    )
    bucket_name = 'data-tpcds'
    folder_names = [
        's1000/time_dim',
        's1000/item',
        's1000/date_dim',
        's1000/customer_demographics',
        's1000/web_sales_',
        's1000/customer1',
    ]

    spark = stackit_spark.get_spark()
    spark.sql("USE lakehouse")
    spark.sql("CREATE NAMESPACE IF NOT EXISTS DEMO_USECASE_AIRFLOW")

    # Load and write one folder at a time so that only a single folder's data
    # has to be held in memory at once.
    for folder_name in folder_names:
        df = _load_parquet_prefix(s3, bucket_name, folder_name)
        if df is None:
            print(f"No readable Parquet data under {folder_name}; skipping table")
            continue

        table_name = f"DEMO_USECASE_AIRFLOW.{folder_name.replace('/', '_')}"
        spark_df = spark.createDataFrame(df)
        # Save Spark DataFrame as an Iceberg table
        spark_df.write.mode("overwrite").saveAsTable(table_name)
read_and_write_data()

View file

@ -0,0 +1,65 @@
def read_and_write_data():
    """Load one Parquet object from S3 into the ``customer_address`` table.

    Only the first object listed under ``s1000/customer_address`` in the
    ``data-tpcds`` bucket is read. Its rows are written (mode ``overwrite``)
    to ``DEMO_USECASE_AIRFLOW.customer_address`` in the ``lakehouse`` catalog.

    No credentials are passed to boto3 explicitly: they are resolved through
    boto3's default credential chain (for example the ``AWS_ACCESS_KEY_ID``
    and ``AWS_SECRET_ACCESS_KEY`` environment variables).

    SECURITY: an access key pair used to be hard-coded in this function and
    was committed to version control; that key pair must be treated as
    compromised and rotated.

    Raises:
        FileNotFoundError: If nothing is listed under the prefix.
    """
    import stackit_spark
    import boto3
    import pyarrow.parquet as pq
    from io import BytesIO

    # Initialize the S3 client (credentials come from the environment).
    s3 = boto3.client(
        's3',
        endpoint_url='https://object.storage.eu01.onstackit.cloud',
    )
    bucket_name = 'data-tpcds'
    folder_name = 's1000/customer_address'

    # "Contents" is absent from the response when nothing matches the prefix.
    listing = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_name)
    contents = listing.get('Contents', [])
    if not contents:
        raise FileNotFoundError(
            f"No objects found under s3://{bucket_name}/{folder_name}"
        )

    # Deliberately limited to the first listed object.
    key = contents[0]['Key']
    body = s3.get_object(Bucket=bucket_name, Key=key)['Body'].read()
    # Drop any index restored from the Parquet metadata, matching the previous
    # concat(..., ignore_index=True) behaviour.
    df = pq.read_table(BytesIO(body)).to_pandas().reset_index(drop=True)

    spark = stackit_spark.get_spark()
    spark.sql("USE lakehouse")
    spark.sql("CREATE NAMESPACE IF NOT EXISTS DEMO_USECASE_AIRFLOW")
    spark_df = spark.createDataFrame(df)
    # Save Spark DataFrame as an Iceberg table
    spark_df.write.mode("overwrite").saveAsTable("DEMO_USECASE_AIRFLOW.customer_address")
if __name__ == "__main__":
    import os

    # Non-secret settings for a local run of the "lakehouse" catalog.
    # NOTE(review): presumably stackit_spark translates these variables into
    # spark.sql.catalog.lakehouse.* options — confirm against that package.
    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__CATALOG-IMPL'] = 'org.apache.iceberg.nessie.NessieCatalog'
    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__IO-IMPL'] = 'org.apache.iceberg.aws.s3.S3FileIO'
    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE'] = 'org.apache.iceberg.spark.SparkCatalog'
    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__URI'] = 'http://nessie-internal.nessie-ns.svc.cluster.local:19120/api/v1'
    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__S3__ENDPOINT'] = 'https://object.storage.eu01.onstackit.cloud'
    os.environ['STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__WAREHOUSE'] = 's3://data-platform-playground-internal/warehouse'

    # SECURITY: the S3 key pair for the catalog used to be hard-coded here and
    # was committed to version control; it must be treated as compromised and
    # rotated. Supply the new values from outside the source tree (secret
    # store, untracked .env file, or the process environment).
    required_secrets = (
        'STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__S3__ACCESS-KEY-ID',
        'STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__S3__SECRET-ACCESS-KEY',
    )
    missing = [name for name in required_secrets if not os.environ.get(name)]
    if missing:
        # Fail early with a clear message rather than deep inside Spark.
        raise SystemExit(
            "Missing required environment variables: " + ", ".join(missing)
        )

    read_and_write_data()

View file

@ -0,0 +1,2 @@
def say_hello():
    """Print a fixed greeting to stdout."""
    greeting = "Hello from say_hello()"
    print(greeting)

View file

@ -0,0 +1,182 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "8aae8ae7-4cb4-4b66-b325-aa3eabdeb455",
"metadata": {},
"outputs": [],
"source": [
"run_date = \"2025-05-08\"\n",
"input_sales = \"local.retail_sales\"\n",
"input_inventory = \"local.retail_inventory\"\n",
"output_table = \"local.restock_plan\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3598df66-6027-4f02-b7bd-c279b9d20c32",
"metadata": {},
"outputs": [],
"source": [
"# create spark session with authentication to the catalog\n",
"\n",
"def get_catalog_token(tokenendpoint, password):\n",
" import requests\n",
"\n",
" # Exchange Dremio PAT to Nessie token\n",
" token_request_body = {\n",
" \"grant_type\": \"urn:ietf:params:oauth:grant-type:token-exchange\",\n",
" \"scope\": \"dremio.all\",\n",
" \"subject_token_type\": \"urn:ietf:params:oauth:token-type:dremio:personal-access-token\",\n",
" \"subject_token\": password,\n",
" }\n",
" x = requests.post(tokenendpoint, data=token_request_body)\n",
" x.raise_for_status()\n",
" return x.json()[\"access_token\"]\n",
"\n",
"\n",
"def get_spark_session(host, catalog_token):\n",
" import stackit_spark\n",
"\n",
" catalog_name_in_spark = \"stackit\"\n",
" return stackit_spark.get_spark(\n",
" additional_config={\n",
" \"spark.jars.packages\": \"org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.iceberg:iceberg-aws-bundle:1.6.1\",\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}\": \"org.apache.iceberg.spark.SparkCatalog\",\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}.warehouse\": \"catalog-s3\",\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}.type\": \"rest\",\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}.uri\": host,\n",
" f\"spark.sql.catalog.{catalog_name_in_spark}.token\": catalog_token,\n",
" }\n",
" )"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "f9071bfb-1767-47b3-907f-723ce4389d9e",
 "metadata": {},
 "outputs": [],
 "source": [
  "import os\n",
  "\n",
  "# The Dremio PAT is a secret: read it from the DREMIO_PAT environment variable\n",
  "# instead of committing it. The token that used to be hard-coded in this cell\n",
  "# was committed to version control and must be revoked.\n",
  "catalog_token = get_catalog_token(\"https://dremio-internal.data-platform.stackit.run/oauth/token\", os.environ[\"DREMIO_PAT\"])\n",
  "spark = get_spark_session(\"https://dremio-internal-catalog.data-platform.stackit.run/iceberg/main\", catalog_token)"
 ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c250ade-1565-4c95-b1ae-134724a2e77b",
"metadata": {},
"outputs": [],
"source": [
"spark.sql(f\"USE stackit\")\n",
"# Step 1: Create a namespace\n",
"spark.sql(\"CREATE NAMESPACE IF NOT EXISTS retail_demo\")\n",
"\n",
"# Step 2: Create synthetic sales data\n",
"sales_df = spark.range(100).selectExpr(\n",
" \"date_add('2025-05-01', cast(rand() * 7 as int)) as sale_date\",\n",
" \"cast(rand() * 10 + 1 as int) as units_sold\",\n",
" \"case when cast(rand() * 3 as int) = 0 then 'A123' \"\n",
" \" when cast(rand() * 3 as int) = 1 then 'B456' \"\n",
" \" else 'C789' end as product_id\",\n",
" \"case when cast(rand() * 2 as int) = 0 then '101' else '102' end as store_id\"\n",
")\n",
"\n",
"# Step 3: Create synthetic inventory data\n",
"inventory_df = spark.range(10).selectExpr(\n",
" \"'2025-05-08' as inventory_date\",\n",
" \"cast(rand() * 20 + 10 as int) as current_stock\",\n",
" \"case when cast(rand() * 3 as int) = 0 then 'A123' \"\n",
" \" when cast(rand() * 3 as int) = 1 then 'B456' \"\n",
" \" else 'C789' end as product_id\",\n",
" \"case when cast(rand() * 2 as int) = 0 then '101' else '102' end as store_id\"\n",
")\n",
"\n",
"# Step 4: Write to Iceberg tables\n",
"sales_df.write.mode(\"overwrite\").saveAsTable(\"retail_demo.sales_data\")\n",
"inventory_df.write.mode(\"overwrite\").saveAsTable(\"retail_demo.inventory_data\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5bd53006-ddbf-4ecf-bf3b-fa173cf9893b",
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import expr\n",
"\n",
"# Load sales and inventory data\n",
"sales = spark.sql(\"\"\"\n",
" SELECT store_id, product_id, sale_date, units_sold\n",
" FROM retail_demo.sales_data\n",
"\"\"\")\n",
"\n",
"inventory = spark.sql(\"\"\"\n",
" SELECT store_id, product_id, inventory_date, current_stock\n",
" FROM retail_demo.inventory_data\n",
"\"\"\")\n",
"\n",
"# Step 2: Aggregate demand per store/product (e.g. last 7 days)\n",
"demand = sales.groupBy(\"store_id\", \"product_id\") \\\n",
" .agg({\"units_sold\": \"avg\"}) \\\n",
" .withColumnRenamed(\"avg(units_sold)\", \"predicted_demand\")\n",
"\n",
"# Step 3: Join with inventory\n",
"restock_plan = demand.join(inventory, on=[\"store_id\", \"product_id\"], how=\"inner\")\n",
"\n",
"# Step 4: Calculate restock quantity\n",
"restock_plan = restock_plan.withColumn(\n",
" \"restock_qty\",\n",
" expr(\"CASE WHEN predicted_demand - current_stock > 0 THEN int(predicted_demand - current_stock) ELSE 0 END\")\n",
")\n",
"\n",
"# Step 5: Add metadata\n",
"restock_plan = restock_plan.withColumn(\"run_date\", expr(\"current_date()\"))\n",
"\n",
"# Step 6: Save to Iceberg\n",
"restock_plan.write.mode(\"overwrite\").saveAsTable(\"retail_demo.restock_plan\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5fea9096-b7b3-44d4-91aa-189ac9ea4a33",
"metadata": {},
"outputs": [],
"source": [
"# Convert the Spark DataFrame to a Pandas DataFrame for better display in Jupyter\n",
"restock_df = restock_plan.toPandas()\n",
"\n",
"# Display the restock plan nicely in Jupyter using HTML table format\n",
"import IPython.display as display\n",
"display.display(display.HTML(restock_df.to_html(index=False)))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}