Initial commit
This commit is contained in:
parent
d4028fca11
commit
a578239c4f
32 changed files with 2559 additions and 0 deletions
35
Demo/scripts/my_tools/catalog_spark.py
Normal file
35
Demo/scripts/my_tools/catalog_spark.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
def get_nessie_token(tokenendpoint, password, timeout=30):
    """Exchange a Dremio personal access token (PAT) for a Nessie token.

    Args:
        tokenendpoint: URL of the OAuth token endpoint the exchange request
            is POSTed to.
        password: The Dremio PAT; sent as the ``subject_token`` form field.
        timeout: Seconds to wait for the endpoint, passed straight to
            ``requests.post``. Without a timeout ``requests`` waits
            indefinitely on a stalled connection.

    Returns:
        The ``access_token`` field of the JSON response body.

    Raises:
        requests.HTTPError: The endpoint answered with an error status.
        requests.Timeout: The endpoint did not answer within ``timeout``.
        KeyError: The response JSON has no ``access_token`` field.
    """
    import requests

    # Exchange Dremio PAT to Nessie token
    token_request_body = {
        "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange",
        "scope": "dremio.all",
        "subject_token_type": "urn:ietf:params:oauth:token-type:dremio:personal-access-token",
        "subject_token": password,
    }
    response = requests.post(
        tokenendpoint,
        data=token_request_body,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["access_token"]
|
||||
|
||||
|
||||
def get_spark_session(host, nessie_token):
    """Create a Spark session with an Iceberg REST catalog named ``stackit``.

    Args:
        host: Value stored as the catalog's ``uri`` setting.
        nessie_token: Value stored as the catalog's ``token`` setting.

    Returns:
        Whatever ``stackit_spark.get_spark`` returns for the assembled
        ``additional_config``.
    """
    import stackit_spark

    # All catalog settings share this key prefix.
    catalog_key = f"spark.sql.catalog.{'stackit'}"

    spark_config = {
        "spark.jars.packages": "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.iceberg:iceberg-aws-bundle:1.6.1",
        catalog_key: "org.apache.iceberg.spark.SparkCatalog",
        f"{catalog_key}.type": "rest",
        f"{catalog_key}.uri": host,
        f"{catalog_key}.token": nessie_token,
    }
    return stackit_spark.get_spark(additional_config=spark_config)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run of the token exchange; the returned token is discarded.
    # "xxxxx" is presumably a placeholder for a real Dremio PAT.
    get_nessie_token(
        tokenendpoint="https://dremio-internal.data-platform-dev.stackit.run/oauth/token",
        password="xxxxx",
    )
|
||||
65
Demo/scripts/my_tools/read_write_data.py
Normal file
65
Demo/scripts/my_tools/read_write_data.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
|
||||
def read_and_write_data():
    """Copy TPC-DS Parquet folders from S3 into Iceberg tables via Spark.

    For every prefix in the built-in folder list, all objects under that
    prefix in the ``data-tpcds`` bucket are downloaded, parsed as Parquet and
    concatenated into one pandas DataFrame. The frame is then written with
    Spark (mode ``overwrite``) to
    ``DEMO_USECASE_AIRFLOW.<prefix with '/' replaced by '_'>``.

    S3 credentials are intentionally not embedded in the source: boto3
    resolves them through its default credential chain (for example the
    ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` environment variables
    or a shared credentials file).

    Objects that cannot be downloaded or parsed are reported on stdout and
    skipped. Prefixes that yield no data at all are reported and skipped
    instead of being passed to Spark.
    """
    # Imports stay function-local, as in the original script.
    import stackit_spark
    import pandas as pd
    import boto3
    import pyarrow.parquet as pq
    from io import BytesIO

    # Initialize the S3 client (credentials come from boto3's default chain)
    s3 = boto3.client(
        's3',
        endpoint_url='https://object.storage.eu01.onstackit.cloud',
    )

    bucket_name = 'data-tpcds'
    folder_names = [
        's1000/time_dim',
        's1000/item',
        's1000/date_dim',
        's1000/customer_demographics',
        's1000/web_sales_',
        's1000/customer1',
    ]

    spark = stackit_spark.get_spark()
    spark.sql("USE lakehouse")
    spark.sql("CREATE NAMESPACE IF NOT EXISTS DEMO_USECASE_AIRFLOW")

    # A paginator follows continuation tokens, so prefixes holding more
    # objects than fit in one list_objects_v2 response are read completely.
    paginator = s3.get_paginator('list_objects_v2')

    for folder_name in folder_names:
        df_list = []
        for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_name):
            # 'Contents' is absent from the response when nothing matches.
            for obj in page.get('Contents', []):
                key = obj['Key']
                try:
                    # Get object from S3
                    response = s3.get_object(Bucket=bucket_name, Key=key)
                    body = response['Body'].read()  # Read the object's bytes
                    # Read Parquet file using PyArrow
                    df_list.append(pq.read_table(BytesIO(body)).to_pandas())
                except Exception as e:
                    # Best effort: one unreadable object must not abort the run.
                    print(f"Error processing file {key} in folder {folder_name}: {e}")

        if not df_list:
            print(f"No readable Parquet data found for folder {folder_name}; skipping")
            continue

        # Write each folder before loading the next one so that only one
        # table's data is held in memory at a time.
        df = pd.concat(df_list, ignore_index=True)
        table_name = f"DEMO_USECASE_AIRFLOW.{folder_name.replace('/', '_')}"
        spark_df = spark.createDataFrame(df)
        # Save Spark DataFrame as an Iceberg table
        spark_df.write.mode("overwrite").saveAsTable(table_name)
|
||||
|
||||
|
||||
# NOTE(review): this call appears to sit at module level with no
# `if __name__ == "__main__":` guard, so it would also run on import — confirm.
read_and_write_data()
|
||||
65
Demo/scripts/my_tools/read_write_table.py
Normal file
65
Demo/scripts/my_tools/read_write_table.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
|
||||
def read_and_write_data():
    """Load the first ``s1000/customer_address`` Parquet object into Iceberg.

    Lists the ``data-tpcds`` bucket under the ``s1000/customer_address``
    prefix, downloads only the first listed object, parses it as Parquet and
    writes it with Spark (mode ``overwrite``) to
    ``DEMO_USECASE_AIRFLOW.customer_address``.

    S3 credentials are intentionally not embedded in the source: boto3
    resolves them through its default credential chain (for example the
    ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` environment variables
    or a shared credentials file).

    Raises:
        KeyError: No object exists under the prefix. The exception type is
            the one the bare ``['Contents']`` lookup used to raise.
    """
    # Imports stay function-local, as in the original script.
    import stackit_spark
    import pandas as pd
    import boto3
    import pyarrow.parquet as pq
    from io import BytesIO

    # Initialize the S3 client (credentials come from boto3's default chain)
    s3 = boto3.client(
        's3',
        endpoint_url='https://object.storage.eu01.onstackit.cloud',
    )

    bucket_name = 'data-tpcds'
    folder_name = 's1000/customer_address'

    listing = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_name)
    # 'Contents' is absent from the response when nothing matches the prefix.
    objs = listing.get('Contents', [])[:1]  # only the first object is loaded
    if not objs:
        raise KeyError(f"No objects found under s3://{bucket_name}/{folder_name}")

    df_list = []
    for obj in objs:
        key = obj['Key']
        # Get object from S3
        response = s3.get_object(Bucket=bucket_name, Key=key)
        body = response['Body'].read()  # Read the object's bytes
        df_list.append(pq.read_table(BytesIO(body)).to_pandas())

    df = pd.concat(df_list, ignore_index=True)

    spark = stackit_spark.get_spark()
    spark.sql("USE lakehouse")
    spark.sql("CREATE NAMESPACE IF NOT EXISTS DEMO_USECASE_AIRFLOW")

    spark_df = spark.createDataFrame(df)
    # Save Spark DataFrame as an Iceberg table
    spark_df.write.mode("overwrite").saveAsTable("DEMO_USECASE_AIRFLOW.customer_address")
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import os

    # Non-secret catalog settings for a local run. They are exported as
    # environment variables before read_and_write_data() is called;
    # presumably stackit_spark.get_spark() reads them — confirm.
    os.environ.update({
        'STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__CATALOG-IMPL': 'org.apache.iceberg.nessie.NessieCatalog',
        'STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__IO-IMPL': 'org.apache.iceberg.aws.s3.S3FileIO',
        'STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE': 'org.apache.iceberg.spark.SparkCatalog',
        'STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__URI': 'http://nessie-internal.nessie-ns.svc.cluster.local:19120/api/v1',
        'STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__S3__ENDPOINT': 'https://object.storage.eu01.onstackit.cloud',
        'STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__WAREHOUSE': 's3://data-platform-playground-internal/warehouse',
    })

    # The S3 key pair is a secret and must never live in source control.
    # It has to be provided by the caller's environment; fail early with a
    # clear message when it is missing.
    required_secrets = (
        'STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__S3__ACCESS-KEY-ID',
        'STACKIT__SPARK__SQL__CATALOG__LAKEHOUSE__S3__SECRET-ACCESS-KEY',
    )
    missing = [name for name in required_secrets if not os.environ.get(name)]
    if missing:
        raise SystemExit(
            "Missing required environment variables: " + ", ".join(missing)
        )

    read_and_write_data()
|
||||
2
Demo/scripts/my_tools/say_hello.py
Normal file
2
Demo/scripts/my_tools/say_hello.py
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
def say_hello():
    """Print a fixed greeting to standard output."""
    greeting = "Hello from say_hello()"
    print(greeting)
|
||||
Loading…
Add table
Add a link
Reference in a new issue