#!/usr/bin/env python3
"""
Test script for PyArrow's NATIVE S3 filesystem with SeaweedFS.

This test uses PyArrow's built-in S3FileSystem (pyarrow.fs.S3FileSystem)
instead of s3fs, providing a pure PyArrow solution for reading and writing
Parquet files to S3-compatible storage.

Requirements:
  - pyarrow>=10.0.0
  - boto3 (optional; used only as a fallback for bucket creation)

Environment Variables:
  S3_ENDPOINT_URL: S3 endpoint (default: localhost:8333)
  S3_ACCESS_KEY: S3 access key (default: some_access_key1)
  S3_SECRET_KEY: S3 secret key (default: some_secret_key1)
  BUCKET_NAME: S3 bucket name (default: test-parquet-bucket)
  TEST_QUICK: Run only small/quick tests (default: 0, set to 1 for quick mode)

Usage:
  # Run with default environment variables
  python3 test_pyarrow_native_s3.py

  # Run with custom environment variables
  S3_ENDPOINT_URL=localhost:8333 \
  S3_ACCESS_KEY=mykey \
  S3_SECRET_KEY=mysecret \
  BUCKET_NAME=mybucket \
  python3 test_pyarrow_native_s3.py
"""

import os
import secrets
import sys
import logging
from typing import Optional

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq

try:
    import boto3
    from botocore.exceptions import ClientError

    HAS_BOTO3 = True
except ImportError:
    HAS_BOTO3 = False

logging.basicConfig(level=logging.INFO, format="%(message)s")

# Configuration from environment variables with defaults
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
TEST_QUICK = os.getenv("TEST_QUICK", "0") == "1"

# Create randomized test directory
TEST_RUN_ID = secrets.token_hex(8)
TEST_DIR = f"parquet-native-tests/{TEST_RUN_ID}"

# Test file sizes
TEST_SIZES = {
    "small": 5,
    "large": 200_000,  # This will create multiple row groups
}

# Filter to only small tests if quick mode is enabled
if TEST_QUICK:
    TEST_SIZES = {"small": TEST_SIZES["small"]}
    logging.info("Quick test mode enabled - running only small tests")


def create_sample_table(num_rows: int = 5) -> pa.Table:
    """Create a sample PyArrow table for testing."""
    return pa.table(
        {
            "id": pa.array(range(num_rows), type=pa.int64()),
            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
        }
    )
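

# --- Optional local sanity check (illustrative only; not called by main()) ---
# A minimal sketch of the same Parquet round-trip done entirely in memory,
# which can help separate PyArrow/Parquet issues from S3/SeaweedFS issues.
# The helper name `roundtrip_in_memory` is an assumption added for illustration.
def roundtrip_in_memory(num_rows: int = 5) -> bool:
    """Write the sample table to an in-memory buffer and read it back."""
    table = create_sample_table(num_rows)
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink)
    restored = pq.read_table(pa.BufferReader(sink.getvalue()))
    return restored.equals(table)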


def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
    """Initialize PyArrow's native S3 filesystem.

    Returns:
        tuple: (S3FileSystem instance, scheme, endpoint)
    """
    try:
        logging.info("Initializing PyArrow S3FileSystem...")
        logging.info(f" Endpoint: {S3_ENDPOINT_URL}")
        logging.info(f" Bucket: {BUCKET_NAME}")

        # Determine scheme from endpoint
        if S3_ENDPOINT_URL.startswith("http://"):
            scheme = "http"
            endpoint = S3_ENDPOINT_URL[7:]  # Remove http://
        elif S3_ENDPOINT_URL.startswith("https://"):
            scheme = "https"
            endpoint = S3_ENDPOINT_URL[8:]  # Remove https://
        else:
            # Default to http for localhost
            scheme = "http"
            endpoint = S3_ENDPOINT_URL

        # Enable bucket creation and deletion for testing
        s3 = pafs.S3FileSystem(
            access_key=S3_ACCESS_KEY,
            secret_key=S3_SECRET_KEY,
            endpoint_override=endpoint,
            scheme=scheme,
            allow_bucket_creation=True,
            allow_bucket_deletion=True,
        )

        logging.info("✓ PyArrow S3FileSystem initialized successfully\n")
        return s3, scheme, endpoint
    except Exception as e:
        logging.exception("✗ Failed to initialize PyArrow S3FileSystem")
        return None, "", ""


def ensure_bucket_exists_boto3(scheme: str, endpoint: str) -> bool:
    """Ensure the test bucket exists using boto3."""
    if not HAS_BOTO3:
        logging.error("boto3 is required for bucket creation")
        return False

    try:
        # Create boto3 client
        endpoint_url = f"{scheme}://{endpoint}"
        s3_client = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=S3_ACCESS_KEY,
            aws_secret_access_key=S3_SECRET_KEY,
            region_name='us-east-1',
        )

        # Check if bucket exists
        try:
            s3_client.head_bucket(Bucket=BUCKET_NAME)
            logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
            return True
        except ClientError as e:
            error_code = e.response['Error']['Code']
            if error_code == '404':
                # Bucket doesn't exist, create it
                logging.info(f"Creating bucket: {BUCKET_NAME}")
                s3_client.create_bucket(Bucket=BUCKET_NAME)
                logging.info(f"✓ Bucket created: {BUCKET_NAME}")
                return True
            else:
                raise
    except Exception as e:
        logging.exception(f"✗ Failed to create/check bucket: {e}")
        return False


def ensure_bucket_exists(s3: pafs.S3FileSystem) -> bool:
    """Ensure the test bucket exists using PyArrow's native S3FileSystem."""
    try:
        # Check if bucket exists by trying to list it
        try:
            file_info = s3.get_file_info(BUCKET_NAME)
            if file_info.type == pafs.FileType.Directory:
                logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
                return True
        except Exception:
            pass

        # Try to create the bucket
        logging.info(f"Creating bucket: {BUCKET_NAME}")
        s3.create_dir(BUCKET_NAME)
        logging.info(f"✓ Bucket created: {BUCKET_NAME}")
        return True
    except Exception as e:
        logging.error(f"✗ Failed to create/check bucket with PyArrow: {e}")
        return False
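

# --- Optional: explicit row-group sizing (illustrative only; not called by main()) ---
# The "large" test below relies on PyArrow's default row-group sizing to produce
# multiple row groups per file. A sketch of forcing a specific row-group size via
# pads.write_dataset is shown here; the helper name and the default rows_per_group
# value are assumptions chosen only for illustration.
def write_with_explicit_row_groups(
    s3: pafs.S3FileSystem,
    table: pa.Table,
    base_dir: str,
    rows_per_group: int = 50_000,
) -> None:
    """Write `table` under `base_dir`, forcing row groups of `rows_per_group` rows."""
    pads.write_dataset(
        table,
        base_dir,
        filesystem=s3,
        format="parquet",
        min_rows_per_group=rows_per_group,
        max_rows_per_group=rows_per_group,
    )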


def test_write_and_read(s3: pafs.S3FileSystem, test_name: str, num_rows: int) -> tuple[bool, str]:
    """Test writing and reading a Parquet dataset using PyArrow's native S3 filesystem."""
    try:
        table = create_sample_table(num_rows)

        # Write using pads.write_dataset
        filename = f"{BUCKET_NAME}/{TEST_DIR}/{test_name}/data.parquet"
        logging.info(f" Writing {num_rows:,} rows to {filename}...")
        pads.write_dataset(
            table,
            filename,
            filesystem=s3,
            format="parquet",
        )
        logging.info(f" ✓ Write completed")

        # Test Method 1: Read with pq.read_table
        logging.info(f" Reading with pq.read_table...")
        table_read = pq.read_table(filename, filesystem=s3)
        if table_read.num_rows != num_rows:
            return False, f"pq.read_table: Row count mismatch (expected {num_rows}, got {table_read.num_rows})"
        logging.info(f" ✓ pq.read_table: {table_read.num_rows:,} rows")

        # Test Method 2: Read with pq.ParquetDataset
        logging.info(f" Reading with pq.ParquetDataset...")
        dataset = pq.ParquetDataset(filename, filesystem=s3)
        table_dataset = dataset.read()
        if table_dataset.num_rows != num_rows:
            return False, f"pq.ParquetDataset: Row count mismatch (expected {num_rows}, got {table_dataset.num_rows})"
        logging.info(f" ✓ pq.ParquetDataset: {table_dataset.num_rows:,} rows")

        # Test Method 3: Read with pads.dataset
        logging.info(f" Reading with pads.dataset...")
        dataset_pads = pads.dataset(filename, filesystem=s3)
        table_pads = dataset_pads.to_table()
        if table_pads.num_rows != num_rows:
            return False, f"pads.dataset: Row count mismatch (expected {num_rows}, got {table_pads.num_rows})"
        logging.info(f" ✓ pads.dataset: {table_pads.num_rows:,} rows")

        return True, "All read methods passed"
    except Exception as e:
        logging.exception(f" ✗ Test failed: {e}")
        return False, f"{type(e).__name__}: {str(e)}"


def cleanup_test_files(s3: pafs.S3FileSystem) -> None:
    """Clean up test files from S3."""
    try:
        test_path = f"{BUCKET_NAME}/{TEST_DIR}"
        logging.info(f"Cleaning up test directory: {test_path}")

        # Delete all files in the test directory
        file_info = s3.get_file_info(pafs.FileSelector(test_path, recursive=True))
        for info in file_info:
            if info.type == pafs.FileType.File:
                s3.delete_file(info.path)

        logging.info("✓ Test directory cleaned up")
    except Exception as e:
        logging.warning(f"Failed to cleanup test directory: {e}")


def main():
    """Run all tests with PyArrow's native S3 filesystem."""
    print("=" * 80)
    print("PyArrow Native S3 Filesystem Tests for SeaweedFS")
    print("Testing Parquet Files with Multiple Row Groups")
    if TEST_QUICK:
        print("*** QUICK TEST MODE - Small files only ***")
    print("=" * 80 + "\n")

    print("Configuration:")
    print(f" S3 Endpoint: {S3_ENDPOINT_URL}")
    print(f" Access Key: {S3_ACCESS_KEY}")
    print(f" Bucket: {BUCKET_NAME}")
    print(f" Test Directory: {TEST_DIR}")
    print(f" Quick Mode: {'Yes (small files only)' if TEST_QUICK else 'No (all file sizes)'}")
    print(f" PyArrow Version: {pa.__version__}")
    print()

    # Initialize S3 filesystem
    s3, scheme, endpoint = init_s3_filesystem()
    if s3 is None:
        print("Cannot proceed without S3 connection")
        return 1

    # Ensure bucket exists - try PyArrow first, fall back to boto3
    bucket_created = ensure_bucket_exists(s3)
    if not bucket_created:
        logging.info("Trying to create bucket with boto3...")
        bucket_created = ensure_bucket_exists_boto3(scheme, endpoint)
    if not bucket_created:
        print("Cannot proceed without bucket")
        return 1

    results = []

    # Test all file sizes
    for size_name, num_rows in TEST_SIZES.items():
        print(f"\n{'='*80}")
        print(f"Testing with {size_name} files ({num_rows:,} rows)")
        print(f"{'='*80}\n")

        test_name = f"{size_name}_test"
        success, message = test_write_and_read(s3, test_name, num_rows)
        results.append((test_name, success, message))

        status = "✓ PASS" if success else "✗ FAIL"
        print(f"\n{status}: {message}\n")

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)

    passed = sum(1 for _, success, _ in results if success)
    total = len(results)

    print(f"\nTotal: {passed}/{total} passed\n")
    for test_name, success, message in results:
        status = "✓" if success else "✗"
        print(f" {status} {test_name}: {message}")

    print("\n" + "=" * 80)
    if passed == total:
        print("✓ ALL TESTS PASSED!")
    else:
        print(f"✗ {total - passed} test(s) failed")
    print("=" * 80 + "\n")

    # Cleanup
    cleanup_test_files(s3)

    return 0 if passed == total else 1


if __name__ == "__main__":
    sys.exit(main())
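
# Quick mode (via the TEST_QUICK variable documented above) runs only the small test:
#   TEST_QUICK=1 python3 test_pyarrow_native_s3.py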