#!/usr/bin/env python3
# /// script
# dependencies = [
#     "pyarrow>=22",
#     "boto3>=1.28.0",
# ]
# ///
"""
Simple example of using PyArrow's native S3 filesystem with SeaweedFS.

This is a minimal example demonstrating how to write and read Parquet files
using PyArrow's built-in S3FileSystem without any additional dependencies
like s3fs.

Usage:
    # Set environment variables
    export S3_ENDPOINT_URL=localhost:8333
    export S3_ACCESS_KEY=some_access_key1
    export S3_SECRET_KEY=some_secret_key1
    export BUCKET_NAME=test-parquet-bucket

    # Run the script
    python3 example_pyarrow_native.py

    # Or run with uv (if available)
    uv run example_pyarrow_native.py
"""

import os
import secrets

import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq

from parquet_test_utils import create_sample_table

# Configuration
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.getenv("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.getenv("S3_SECRET_KEY", "some_secret_key1")

# Determine scheme from endpoint
if S3_ENDPOINT_URL.startswith("http://"):
    scheme = "http"
    endpoint = S3_ENDPOINT_URL[7:]
elif S3_ENDPOINT_URL.startswith("https://"):
    scheme = "https"
    endpoint = S3_ENDPOINT_URL[8:]
else:
    scheme = "http"  # Default to http for localhost
    endpoint = S3_ENDPOINT_URL

print(f"Connecting to S3 endpoint: {scheme}://{endpoint}")

# Initialize PyArrow's NATIVE S3 filesystem
s3 = pafs.S3FileSystem(
    access_key=S3_ACCESS_KEY,
    secret_key=S3_SECRET_KEY,
    endpoint_override=endpoint,
    scheme=scheme,
    allow_bucket_creation=True,
    allow_bucket_deletion=True,
)
print("✓ Connected to S3 endpoint")

# Create bucket if needed (using boto3)
try:
    import boto3
    from botocore.exceptions import ClientError

    s3_client = boto3.client(
        "s3",
        endpoint_url=f"{scheme}://{endpoint}",
        aws_access_key_id=S3_ACCESS_KEY,
        aws_secret_access_key=S3_SECRET_KEY,
        region_name="us-east-1",
    )
    try:
        s3_client.head_bucket(Bucket=BUCKET_NAME)
        print(f"✓ Bucket exists: {BUCKET_NAME}")
    except ClientError as e:
        if e.response["Error"]["Code"] == "404":
            print(f"Creating bucket: {BUCKET_NAME}")
            s3_client.create_bucket(Bucket=BUCKET_NAME)
            print(f"✓ Bucket created: {BUCKET_NAME}")
        else:
            raise
except ImportError:
    print("Warning: boto3 not available, assuming bucket exists")

# Generate a unique path. Note: pads.write_dataset treats this path as a
# base directory and writes one or more part files inside it.
filename = f"{BUCKET_NAME}/dataset-{secrets.token_hex(8)}/test.parquet"
print(f"\nWriting Parquet dataset to: {filename}")

# Write dataset
table = create_sample_table(200_000)
pads.write_dataset(
    table,
    filename,
    filesystem=s3,
    format="parquet",
)
print(f"✓ Wrote {table.num_rows:,} rows")

# Read with pq.read_table
print("\nReading with pq.read_table...")
table_read = pq.read_table(filename, filesystem=s3)
print(f"✓ Read {table_read.num_rows:,} rows")

# Read with pq.ParquetDataset
print("\nReading with pq.ParquetDataset...")
dataset = pq.ParquetDataset(filename, filesystem=s3)
table_dataset = dataset.read()
print(f"✓ Read {table_dataset.num_rows:,} rows")

# Read with pads.dataset
print("\nReading with pads.dataset...")
dataset_pads = pads.dataset(filename, filesystem=s3)
table_pads = dataset_pads.to_table()
print(f"✓ Read {table_pads.num_rows:,} rows")

print("\n✅ All operations completed successfully!")
print(f"\nDataset written to: {filename}")
print("You can verify the output using the SeaweedFS S3 API or weed shell")
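
# Optional verification sketch (not part of the original example): list what
# pads.write_dataset actually produced under the base path created above.
# This assumes the dataset directory layout from this script; write_dataset
# stores one or more part files (e.g. part-0.parquet) rather than a single
# file, and PyArrow's filesystem API can enumerate them directly.
selector = pafs.FileSelector(filename, recursive=True)
for info in s3.get_file_info(selector):
    if info.type == pafs.FileType.File:
        print(f"  {info.path} ({info.size:,} bytes)")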