#!/usr/bin/env python3
# /// script
# dependencies = [
# "pyarrow>=22",
# "boto3>=1.28.0",
# ]
# ///
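# The block above is PEP 723 inline script metadata; `uv run` reads it and
# installs pyarrow and boto3 automatically before executing the script.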
"""
Simple example of using PyArrow's native S3 filesystem with SeaweedFS.
This is a minimal example demonstrating how to write and read Parquet files
using PyArrow's built-in S3FileSystem without any additional dependencies
like s3fs.
Usage:
# Set environment variables
export S3_ENDPOINT_URL=localhost:8333
export S3_ACCESS_KEY=some_access_key1
export S3_SECRET_KEY=some_secret_key1
export BUCKET_NAME=test-parquet-bucket
# Run the script
python3 example_pyarrow_native.py
# Or run with uv (if available)
uv run example_pyarrow_native.py
"""
import os
import secrets

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq

from parquet_test_utils import create_sample_table
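# NOTE: create_sample_table is a local test helper (parquet_test_utils.py in
# this directory); it is assumed to return a pyarrow.Table with the requested
# number of rows.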

# Configuration
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.getenv("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.getenv("S3_SECRET_KEY", "some_secret_key1")

# Determine scheme from endpoint
if S3_ENDPOINT_URL.startswith("http://"):
    scheme = "http"
    endpoint = S3_ENDPOINT_URL[7:]
elif S3_ENDPOINT_URL.startswith("https://"):
    scheme = "https"
    endpoint = S3_ENDPOINT_URL[8:]
else:
    scheme = "http"  # Default to http for localhost
    endpoint = S3_ENDPOINT_URL

print(f"Connecting to S3 endpoint: {scheme}://{endpoint}")

# Initialize PyArrow's NATIVE S3 filesystem
s3 = pafs.S3FileSystem(
    access_key=S3_ACCESS_KEY,
    secret_key=S3_SECRET_KEY,
    endpoint_override=endpoint,
    scheme=scheme,
    allow_bucket_creation=True,
    allow_bucket_deletion=True,
)
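# allow_bucket_creation/allow_bucket_deletion opt in to bucket-level operations;
# PyArrow refuses to create or delete buckets unless these flags are set.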
print("✓ Connected to S3 endpoint")

# Create bucket if needed (using boto3)
try:
    import boto3
    from botocore.exceptions import ClientError

    s3_client = boto3.client(
        's3',
        endpoint_url=f"{scheme}://{endpoint}",
        aws_access_key_id=S3_ACCESS_KEY,
        aws_secret_access_key=S3_SECRET_KEY,
        region_name='us-east-1',
    )
    try:
        s3_client.head_bucket(Bucket=BUCKET_NAME)
        print(f"✓ Bucket exists: {BUCKET_NAME}")
    except ClientError as e:
        if e.response['Error']['Code'] == '404':
            print(f"Creating bucket: {BUCKET_NAME}")
            s3_client.create_bucket(Bucket=BUCKET_NAME)
            print(f"✓ Bucket created: {BUCKET_NAME}")
        else:
            raise
except ImportError:
    print("Warning: boto3 not available, assuming bucket exists")

# Generate a unique path for this run's dataset
filename = f"{BUCKET_NAME}/dataset-{secrets.token_hex(8)}/test.parquet"
print(f"\nWriting Parquet dataset to: {filename}")

# Write dataset
table = create_sample_table(200_000)
pads.write_dataset(
    table,
    filename,
    filesystem=s3,
    format="parquet",
)
print(f"✓ Wrote {table.num_rows:,} rows")

# Read with pq.read_table
print("\nReading with pq.read_table...")
table_read = pq.read_table(filename, filesystem=s3)
print(f"✓ Read {table_read.num_rows:,} rows")

# Read with pq.ParquetDataset
print("\nReading with pq.ParquetDataset...")
dataset = pq.ParquetDataset(filename, filesystem=s3)
table_dataset = dataset.read()
print(f"✓ Read {table_dataset.num_rows:,} rows")

# Read with pads.dataset
print("\nReading with pads.dataset...")
dataset_pads = pads.dataset(filename, filesystem=s3)
table_pads = dataset_pads.to_table()
print(f"✓ Read {table_pads.num_rows:,} rows")
print("\n✅ All operations completed successfully!")
print(f"\nFile written to: {filename}")
print("You can verify the file using the SeaweedFS S3 API or weed shell")