#!/usr/bin/env python3
"""
Test script for PyArrow's NATIVE S3 filesystem with SeaweedFS.

This test uses PyArrow's built-in S3FileSystem (pyarrow.fs.S3FileSystem)
instead of s3fs, providing a pure PyArrow solution for reading and writing
Parquet files to S3-compatible storage.

Requirements:
- pyarrow>=10.0.0
- boto3 (optional, used only as a fallback for bucket creation)

Environment Variables:
    S3_ENDPOINT_URL: S3 endpoint (default: localhost:8333)
    S3_ACCESS_KEY: S3 access key (default: some_access_key1)
    S3_SECRET_KEY: S3 secret key (default: some_secret_key1)
    BUCKET_NAME: S3 bucket name (default: test-parquet-bucket)
    TEST_QUICK: Run only small/quick tests (default: 0, set to 1 for quick mode)

Usage:
    # Run with default environment variables
    python3 test_pyarrow_native_s3.py

    # Run with custom environment variables
    S3_ENDPOINT_URL=localhost:8333 \
    S3_ACCESS_KEY=mykey \
    S3_SECRET_KEY=mysecret \
    BUCKET_NAME=mybucket \
    python3 test_pyarrow_native_s3.py
"""

import os
import secrets
import sys
import logging
from datetime import datetime
from typing import Optional

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq
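
# boto3 is optional: it is used only as a fallback for creating the test bucket
# when creation through PyArrow's own S3FileSystem fails.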
try:
    import boto3
    from botocore.exceptions import ClientError
    HAS_BOTO3 = True
except ImportError:
    HAS_BOTO3 = False

logging.basicConfig(level=logging.INFO, format="%(message)s")

# Configuration from environment variables with defaults
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
TEST_QUICK = os.getenv("TEST_QUICK", "0") == "1"

# Create randomized test directory
TEST_RUN_ID = secrets.token_hex(8)
TEST_DIR = f"parquet-native-tests/{TEST_RUN_ID}"

# Test file sizes
TEST_SIZES = {
    "small": 5,
    "large": 200_000,  # This will create multiple row groups
}

# Filter to only small tests if quick mode is enabled
if TEST_QUICK:
    TEST_SIZES = {"small": TEST_SIZES["small"]}
    logging.info("Quick test mode enabled - running only small tests")


def create_sample_table(num_rows: int = 5) -> pa.Table:
    """Create a sample PyArrow table for testing."""
    return pa.table(
        {
            "id": pa.array(range(num_rows), type=pa.int64()),
            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
        }
    )


def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
    """Initialize PyArrow's native S3 filesystem.

    Returns:
        tuple: (S3FileSystem instance, scheme, endpoint)
    """
    try:
        logging.info("Initializing PyArrow S3FileSystem...")
        logging.info(f" Endpoint: {S3_ENDPOINT_URL}")
        logging.info(f" Bucket: {BUCKET_NAME}")

        # Determine scheme from endpoint
        if S3_ENDPOINT_URL.startswith("http://"):
            scheme = "http"
            endpoint = S3_ENDPOINT_URL[7:]  # Remove http://
        elif S3_ENDPOINT_URL.startswith("https://"):
            scheme = "https"
            endpoint = S3_ENDPOINT_URL[8:]  # Remove https://
        else:
            # Default to http for localhost
            scheme = "http"
            endpoint = S3_ENDPOINT_URL
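
        # endpoint_override takes just host[:port]; the transport scheme is
        # passed separately via the scheme argument below.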
        # Enable bucket creation and deletion for testing
        s3 = pafs.S3FileSystem(
            access_key=S3_ACCESS_KEY,
            secret_key=S3_SECRET_KEY,
            endpoint_override=endpoint,
            scheme=scheme,
            allow_bucket_creation=True,
            allow_bucket_deletion=True,
        )
        logging.info("✓ PyArrow S3FileSystem initialized successfully\n")
        return s3, scheme, endpoint
    except Exception:
        logging.exception("✗ Failed to initialize PyArrow S3FileSystem")
        return None, "", ""


def ensure_bucket_exists_boto3(scheme: str, endpoint: str) -> bool:
    """Ensure the test bucket exists using boto3."""
    if not HAS_BOTO3:
        logging.error("boto3 is required for bucket creation")
        return False
    try:
        # Create boto3 client
        endpoint_url = f"{scheme}://{endpoint}"
        s3_client = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=S3_ACCESS_KEY,
            aws_secret_access_key=S3_SECRET_KEY,
            region_name='us-east-1',
        )
        # Check if bucket exists
        try:
            s3_client.head_bucket(Bucket=BUCKET_NAME)
            logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
            return True
        except ClientError as e:
            error_code = e.response['Error']['Code']
            if error_code == '404':
                # Bucket doesn't exist, create it
                logging.info(f"Creating bucket: {BUCKET_NAME}")
                s3_client.create_bucket(Bucket=BUCKET_NAME)
                logging.info(f"✓ Bucket created: {BUCKET_NAME}")
                return True
            else:
                raise
    except Exception as e:
        logging.exception(f"✗ Failed to create/check bucket: {e}")
        return False


def ensure_bucket_exists(s3: pafs.S3FileSystem) -> bool:
    """Ensure the test bucket exists using PyArrow's native S3FileSystem."""
    try:
        # Check if bucket exists by trying to list it
        try:
            file_info = s3.get_file_info(BUCKET_NAME)
            if file_info.type == pafs.FileType.Directory:
                logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
                return True
        except Exception:
            pass
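
        # Note: PyArrow's S3 filesystem exposes buckets as top-level directories,
        # so create_dir() on a bare bucket name creates the bucket itself; this
        # relies on allow_bucket_creation=True set in init_s3_filesystem().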
        # Try to create the bucket
        logging.info(f"Creating bucket: {BUCKET_NAME}")
        s3.create_dir(BUCKET_NAME)
        logging.info(f"✓ Bucket created: {BUCKET_NAME}")
        return True
    except Exception as e:
        logging.error(f"✗ Failed to create/check bucket with PyArrow: {e}")
        return False


def test_write_and_read(s3: pafs.S3FileSystem, test_name: str, num_rows: int) -> tuple[bool, str]:
    """Test writing and reading a Parquet dataset using PyArrow's native S3 filesystem."""
    try:
        table = create_sample_table(num_rows)

        # Write using pads.write_dataset
        filename = f"{BUCKET_NAME}/{TEST_DIR}/{test_name}/data.parquet"
        logging.info(f" Writing {num_rows:,} rows to {filename}...")
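        # write_dataset() treats this path as a base directory and writes one or
        # more part-*.parquet files beneath it, so the readers below open the same
        # path as a dataset directory rather than a single file.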
        pads.write_dataset(
            table,
            filename,
            filesystem=s3,
            format="parquet",
        )
        logging.info(" ✓ Write completed")

        # Test Method 1: Read with pq.read_table
        logging.info(" Reading with pq.read_table...")
        table_read = pq.read_table(filename, filesystem=s3)
        if table_read.num_rows != num_rows:
            return False, f"pq.read_table: Row count mismatch (expected {num_rows}, got {table_read.num_rows})"
        logging.info(f" ✓ pq.read_table: {table_read.num_rows:,} rows")

        # Test Method 2: Read with pq.ParquetDataset
        logging.info(" Reading with pq.ParquetDataset...")
        dataset = pq.ParquetDataset(filename, filesystem=s3)
        table_dataset = dataset.read()
        if table_dataset.num_rows != num_rows:
            return False, f"pq.ParquetDataset: Row count mismatch (expected {num_rows}, got {table_dataset.num_rows})"
        logging.info(f" ✓ pq.ParquetDataset: {table_dataset.num_rows:,} rows")

        # Test Method 3: Read with pads.dataset
        logging.info(" Reading with pads.dataset...")
        dataset_pads = pads.dataset(filename, filesystem=s3)
        table_pads = dataset_pads.to_table()
        if table_pads.num_rows != num_rows:
            return False, f"pads.dataset: Row count mismatch (expected {num_rows}, got {table_pads.num_rows})"
        logging.info(f" ✓ pads.dataset: {table_pads.num_rows:,} rows")

        return True, "All read methods passed"
    except Exception as e:
        logging.exception(f" ✗ Test failed: {e}")
        return False, f"{type(e).__name__}: {str(e)}"


def cleanup_test_files(s3: pafs.S3FileSystem) -> None:
    """Clean up test files from S3."""
    try:
        test_path = f"{BUCKET_NAME}/{TEST_DIR}"
        logging.info(f"Cleaning up test directory: {test_path}")

        # Delete all files in the test directory
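        # Only FileType.File entries are removed; any directory entries the backend
        # exposes for this prefix are left in place, which is typically fine for
        # S3-style stores.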
        file_info = s3.get_file_info(pafs.FileSelector(test_path, recursive=True))
        for info in file_info:
            if info.type == pafs.FileType.File:
                s3.delete_file(info.path)
        logging.info("✓ Test directory cleaned up")
    except Exception as e:
        logging.warning(f"Failed to cleanup test directory: {e}")


def main():
    """Run all tests with PyArrow's native S3 filesystem."""
    print("=" * 80)
    print("PyArrow Native S3 Filesystem Tests for SeaweedFS")
    print("Testing Parquet Files with Multiple Row Groups")
    if TEST_QUICK:
        print("*** QUICK TEST MODE - Small files only ***")
    print("=" * 80 + "\n")

    print("Configuration:")
    print(f" S3 Endpoint: {S3_ENDPOINT_URL}")
    print(f" Access Key: {S3_ACCESS_KEY}")
    print(f" Bucket: {BUCKET_NAME}")
    print(f" Test Directory: {TEST_DIR}")
    print(f" Quick Mode: {'Yes (small files only)' if TEST_QUICK else 'No (all file sizes)'}")
    print(f" PyArrow Version: {pa.__version__}")
    print()

    # Initialize S3 filesystem
    s3, scheme, endpoint = init_s3_filesystem()
    if s3 is None:
        print("Cannot proceed without S3 connection")
        return 1

    # Ensure bucket exists - try PyArrow first, fall back to boto3
    bucket_created = ensure_bucket_exists(s3)
    if not bucket_created:
        logging.info("Trying to create bucket with boto3...")
        bucket_created = ensure_bucket_exists_boto3(scheme, endpoint)
    if not bucket_created:
        print("Cannot proceed without bucket")
        return 1

    results = []

    # Test all file sizes
    for size_name, num_rows in TEST_SIZES.items():
        print(f"\n{'='*80}")
        print(f"Testing with {size_name} files ({num_rows:,} rows)")
        print(f"{'='*80}\n")

        test_name = f"{size_name}_test"
        success, message = test_write_and_read(s3, test_name, num_rows)
        results.append((test_name, success, message))

        status = "✓ PASS" if success else "✗ FAIL"
        print(f"\n{status}: {message}\n")

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    passed = sum(1 for _, success, _ in results if success)
    total = len(results)
    print(f"\nTotal: {passed}/{total} passed\n")
    for test_name, success, message in results:
        status = "✓" if success else "✗"
        print(f" {status} {test_name}: {message}")

    print("\n" + "=" * 80)
    if passed == total:
        print("✓ ALL TESTS PASSED!")
    else:
        print(f"✗ {total - passed} test(s) failed")
    print("=" * 80 + "\n")

    # Cleanup
    cleanup_test_files(s3)

    return 0 if passed == total else 1


if __name__ == "__main__":
    sys.exit(main())