def read_and_write_data():
    """Copy Parquet folders from object storage into Spark tables.

    For every prefix in ``folder_names`` the function lists the matching
    objects in the ``data-tpcds`` bucket, reads each object as Parquet into a
    pandas DataFrame, concatenates the parts, and finally writes one table per
    prefix into the ``DEMO_USECASE_AIRFLOW`` namespace using mode
    ``overwrite``. The table name is the prefix with ``/`` replaced by ``_``.

    Credentials are NOT embedded here: boto3 resolves them through its
    standard lookup chain (e.g. the ``AWS_ACCESS_KEY_ID`` /
    ``AWS_SECRET_ACCESS_KEY`` environment variables or a shared credentials
    file). The key pair that used to be hard-coded in this function should be
    treated as leaked and rotated.

    Files that cannot be fetched or parsed are reported on stdout and
    skipped (best effort). Prefixes that yield no usable data are reported
    and no table is written for them.

    Returns:
        None. The function is executed for its side effects only.
    """
    # NOTE(review): imports are kept function-local as in the original,
    # presumably so the function stays self-contained when shipped to a
    # remote runner -- confirm before hoisting them to module level.
    import stackit_spark
    import pandas as pd
    import boto3
    import pyarrow.parquet as pq
    from io import BytesIO

    # Initialize the S3 client; credentials come from boto3's default chain.
    s3 = boto3.client(
        's3',
        endpoint_url='https://object.storage.eu01.onstackit.cloud',
    )

    bucket_name = 'data-tpcds'
    folder_names = [
        's1000/time_dim',
        's1000/item',
        's1000/date_dim',
        's1000/customer_demographics',
        's1000/web_sales_',
        's1000/customer1',
    ]

    # A single list_objects_v2 call returns at most 1000 keys and omits
    # 'Contents' entirely when nothing matches; the paginator handles both.
    paginator = s3.get_paginator('list_objects_v2')

    def _load_prefix(prefix):
        """Return all Parquet parts under *prefix* as one DataFrame, or None."""
        parts = []
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            for obj in page.get('Contents', []):
                key = obj['Key']
                try:
                    # Download the whole object and parse it with PyArrow.
                    response = s3.get_object(Bucket=bucket_name, Key=key)
                    body = response['Body'].read()
                    parts.append(pq.read_table(BytesIO(body)).to_pandas())
                except Exception as e:
                    # Deliberately best effort: one unreadable object (for
                    # example a non-Parquet marker file) must not abort the
                    # whole folder.
                    print(f"Error processing file {key} in folder {prefix}: {e}")
        if not parts:
            return None
        return pd.concat(parts, ignore_index=True)

    spark = stackit_spark.get_spark()
    spark.sql("USE lakehouse")
    spark.sql("CREATE NAMESPACE IF NOT EXISTS DEMO_USECASE_AIRFLOW")

    # Phase 1: load everything, so a listing failure aborts before any write.
    result_dfs = {folder_name: _load_prefix(folder_name) for folder_name in folder_names}

    # Phase 2: write one table per folder that actually produced rows.
    for folder_name, pandas_df in result_dfs.items():
        if pandas_df is None or pandas_df.empty:
            # createDataFrame cannot build a DataFrame from None, and cannot
            # infer a schema from an empty pandas DataFrame.
            print(f"No data loaded for folder {folder_name}; skipping table creation")
            continue
        table_name = f"DEMO_USECASE_AIRFLOW.{folder_name.replace('/', '_')}"
        spark_df = spark.createDataFrame(pandas_df)
        # Save Spark DataFrame as an Iceberg table
        spark_df.write.mode("overwrite").saveAsTable(table_name)


read_and_write_data()