Initial commit

This commit is contained in:
Tim_Doernemann 2026-05-28 17:44:11 +02:00
parent d4028fca11
commit a578239c4f
32 changed files with 2559 additions and 0 deletions

View file

@ -0,0 +1,36 @@
# Dremio 25 only
def get_nessie_token(tokenendpoint, password):
import requests
# Exchange Dremio PAT to Nessie token
token_request_body = {
"grant_type": "urn:ietf:params:oauth:grant-type:token-exchange",
"scope": "dremio.all",
"subject_token_type": "urn:ietf:params:oauth:token-type:dremio:personal-access-token",
"subject_token": password,
}
x = requests.post(tokenendpoint, data=token_request_body)
x.raise_for_status()
return x.json()["access_token"]
def get_spark_session(host, nessie_token):
import stackit_spark
catalog_name_in_spark = "stackit"
return stackit_spark.get_spark(
additional_config={
"spark.jars.packages": "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.iceberg:iceberg-aws-bundle:1.6.1",
f"spark.sql.catalog.{catalog_name_in_spark}": "org.apache.iceberg.spark.SparkCatalog",
f"spark.sql.catalog.{catalog_name_in_spark}.type": "rest",
f"spark.sql.catalog.{catalog_name_in_spark}.uri": host,
f"spark.sql.catalog.{catalog_name_in_spark}.token": nessie_token,
}
)
if __name__ == "__main__":
get_nessie_token(
"https://dremio-internal.data-platform-dev.stackit.run/oauth/token",
"xxxxx",
)

View file

@ -0,0 +1,150 @@
import time
import pyarrow as pa
from pyarrow import flight
import logging
logger = logging.getLogger(__file__)
def initialize_flight(flight_uri: str, username, password):
"""Initialize flight connection for a configuration"""
client = flight.FlightClient(flight_uri)
headers = []
initial_options = flight.FlightCallOptions(headers=headers)
auth_header = client.authenticate_basic_token(username, password, initial_options)
return client, auth_header
def read_query_arrow(
client: flight.FlightClient,
auth_header: tuple,
query: str,
return_iterator: bool = False,
tqdm=None,
):
"""Load Dremio query to Arrow Table using Flight
Parameters
----------
client: flight.FlightClient
The flight client to be used.
auth_header: (str, str)
The bearer token to be used.
query : str
The dremio query to be executed.
return_iterator : bool, optional
If True, an iterator yielding Arrow Tables is returned instead of the
whole DataFrame.
return_only_info : bool, optional
If True, only the flight info is returned. This is useful if you want to
get the schema of a query without actually loading the data.
Default: False
tqdm : tqdm.tqdm
tqdm used for progress bar.
Returns
-------
tbl : pyarrow.Table
The query result. If return_iterator is True, returns an Iterator yielding
Arrow Tables
"""
options = flight.FlightCallOptions(
headers=[auth_header],
)
flight_desc = flight.FlightDescriptor.for_command(command=query)
flight_info = client.get_flight_info(flight_desc, options)
# Retrieve the result set as a stream of Arrow record batches.
reader = client.do_get(flight_info.endpoints[0].ticket, options)
def _generator(_return_table=False):
pbar = None if tqdm is None else tqdm(desc="Flight Transfer Chunk")
while True:
if pbar is not None:
pbar.update()
try:
batch, _ = reader.read_chunk()
if _return_table:
batch = pa.Table.from_batches([batch])
yield batch
except StopIteration:
break
batches = _generator(_return_table=return_iterator)
if not return_iterator:
t = time.time()
data = pa.Table.from_batches(batches)
logger.debug(f"Loaded query {query} via flight in {time.time()-t:.2f}s.")
return data
else:
return batches
def read_query_pandas(
client: flight.FlightClient,
auth_header: tuple,
query,
pandas_options: dict = None,
return_iterator: bool = False,
tqdm=None,
):
"""Load Dremio query to Pandas DataFrame using Flight
Parameters
----------
client: flight.FlightClient
The flight client to be used.
auth_header: (str, str)
The bearer token to be used.
query : str
The dremio query to be executed.
additional_headers : list, optional
Additional headers to be passed in the ``flight.FlightCallOptions``.
pandas_options : dict , optional
Additional kwargs passed to ``pyarrow._flight.FlightStreamReader.read_pandas``.
return_iterator : bool, optional
If True, an iterator yielding DataFrames is returned instead of the
whole DataFrame.
tqdm : tqdm.tqdm
tqdm used for progress bar.
Returns
-------
df : pandas.DataFrame
The query result. If ``return_iterator`` is True, returns an Iterator yielding
pandas DataFrames.
"""
pandas_options = pandas_options or {}
r = read_query_arrow(
query=query,
auth_header=auth_header,
client=client,
return_iterator=return_iterator,
tqdm=tqdm,
)
if return_iterator:
return (df.to_pandas(**pandas_options) for df in r)
else:
return r.to_pandas(**pandas_options)
if __name__ == "__main__":
client, auth_header = initialize_flight(
flight_uri="grpc+tls://dremio-internal-sql.data-platform-dev.stackit.run:32010",
username="Christian.Thiel_ext@external.mail.schwarz",
password="xxxx",
)
data = read_query_pandas(
client,
auth_header,
query="SELECT 1",
)
print(data)

2
Demo/tools/say_hello.py Normal file
View file

@ -0,0 +1,2 @@
def say_hello():
print("Hello from imported function!")