65 lines
No EOL
2.1 KiB
Python
65 lines
No EOL
2.1 KiB
Python
|
|
def read_and_write_data():
    """Copy Parquet folders from S3-compatible object storage into Spark tables.

    For each prefix in ``folder_names`` the function lists every object under
    that prefix in the ``data-tpcds`` bucket, downloads it, parses it as
    Parquet with PyArrow, concatenates the parts into a single pandas
    DataFrame and overwrites the table
    ``DEMO_USECASE_AIRFLOW.<prefix with '/' replaced by '_'>`` through Spark.

    Credentials are taken from the ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY`` environment variables. When a variable is unset,
    ``None`` is passed and boto3 falls back to its default credential chain.

    Objects that cannot be downloaded or parsed are reported on stdout and
    skipped (best effort). A prefix that yields no readable data is reported
    and skipped instead of being handed to Spark as ``None``.

    Returns:
        None. The effect of the function is the set of tables it writes.
    """
    # Imports stay function-local and in their original order.
    import stackit_spark
    import pandas as pd
    import os
    import boto3
    import pyarrow.parquet as pq
    from io import BytesIO

    # Initialize the S3 client.
    # SECURITY: the access key pair used to be hard-coded here. Anything that
    # was committed to source control should be treated as leaked and rotated.
    s3 = boto3.client(
        's3',
        aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
        aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
        endpoint_url='https://object.storage.eu01.onstackit.cloud',
    )

    bucket_name = 'data-tpcds'
    folder_names = [
        's1000/time_dim',
        's1000/item',
        's1000/date_dim',
        's1000/customer_demographics',
        's1000/web_sales_',
        's1000/customer1',
    ]

    spark = stackit_spark.get_spark()
    spark.sql("USE lakehouse")
    spark.sql("CREATE NAMESPACE IF NOT EXISTS DEMO_USECASE_AIRFLOW")

    # A single list_objects_v2 call returns only one page of keys; the
    # paginator follows continuation tokens so large folders are not truncated.
    paginator = s3.get_paginator('list_objects_v2')

    for folder_name in folder_names:
        df_list = []
        for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_name):
            # 'Contents' is absent from the response when nothing matches.
            for obj in page.get('Contents', []):
                key = obj['Key']
                try:
                    # Get object from S3 and read its bytes.
                    response = s3.get_object(Bucket=bucket_name, Key=key)
                    body = response['Body'].read()
                    # Read Parquet file using PyArrow.
                    table = pq.read_table(BytesIO(body))
                    df_list.append(table.to_pandas())
                except Exception as e:
                    # Deliberate best effort: report the object and move on.
                    print(f"Error processing file {key} in folder {folder_name}: {e}")

        if not df_list:
            # Nothing usable under this prefix; createDataFrame(None) would fail.
            print(f"No readable Parquet data found for folder {folder_name}; skipping")
            continue

        # Concatenate the DataFrames for the current folder.
        df = pd.concat(df_list, ignore_index=True)

        # Write each table as soon as its data is loaded, so only one folder's
        # data has to be held in memory at a time.
        table_name = f"DEMO_USECASE_AIRFLOW.{folder_name.replace('/', '_')}"
        spark_df = spark.createDataFrame(df)
        # Save the Spark DataFrame as a table (an Iceberg table according to
        # the original author's note; that depends on the catalog setup).
        spark_df.write.mode("overwrite").saveAsTable(table_name)
|
|
|
|
|
|
# Run the copy job as soon as this file is loaded; the call is unconditional
# (there is no __main__ guard), so importing the module also triggers it.
read_and_write_data()