#!/usr/bin/env python3
"""
Test script for PyArrow's NATIVE S3 filesystem with SeaweedFS.

This test uses PyArrow's built-in S3FileSystem (pyarrow.fs.S3FileSystem)
instead of s3fs, providing a pure PyArrow solution for reading and writing
Parquet files to S3-compatible storage.

Requirements:
- Python 3.9+ (the script uses built-in generic type hints such as tuple[...])
- pyarrow>=10.0.0
- boto3 (optional; used only as a fallback for bucket creation)

Environment Variables:
    S3_ENDPOINT_URL: S3 endpoint (default: localhost:8333)
    S3_ACCESS_KEY: S3 access key (default: some_access_key1)
    S3_SECRET_KEY: S3 secret key (default: some_secret_key1)
    BUCKET_NAME: S3 bucket name (default: test-parquet-bucket)
    TEST_QUICK: Run only small/quick tests (default: 0, set to 1 for quick mode)

Usage:
    # Run with default environment variables
    python3 test_pyarrow_native_s3.py

    # Run with custom environment variables
    S3_ENDPOINT_URL=localhost:8333 \
    S3_ACCESS_KEY=mykey \
    S3_SECRET_KEY=mysecret \
    BUCKET_NAME=mybucket \
    python3 test_pyarrow_native_s3.py
"""

import logging
import os
import secrets
import sys
from typing import Optional

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq
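
# boto3 is an optional dependency: it is only used as a fallback for bucket
# creation when PyArrow's own bucket check/creation fails (see main()).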
try:
    import boto3
    from botocore.exceptions import ClientError
    HAS_BOTO3 = True
except ImportError:
    HAS_BOTO3 = False

from parquet_test_utils import create_sample_table

logging.basicConfig(level=logging.INFO, format="%(message)s")

# Configuration from environment variables with defaults
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.environ.get("BUCKET_NAME", "test-parquet-bucket")
TEST_QUICK = os.environ.get("TEST_QUICK", "0") == "1"

# Create randomized test directory
TEST_RUN_ID = secrets.token_hex(8)
TEST_DIR = f"parquet-native-tests/{TEST_RUN_ID}"

# Test file sizes
TEST_SIZES = {
    "small": 5,
    "large": 200_000,  # This will create multiple row groups
}

# Filter to only small tests if quick mode is enabled
if TEST_QUICK:
    TEST_SIZES = {"small": TEST_SIZES["small"]}
    logging.info("Quick test mode enabled - running only small tests")


def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
    """Initialize PyArrow's native S3 filesystem.

    Returns:
        tuple: (S3FileSystem instance or None on failure, scheme, endpoint)
    """
    try:
        logging.info("Initializing PyArrow S3FileSystem...")
        logging.info(f"  Endpoint: {S3_ENDPOINT_URL}")
        logging.info(f"  Bucket: {BUCKET_NAME}")

        # Determine scheme from endpoint
        if S3_ENDPOINT_URL.startswith("http://"):
            scheme = "http"
            endpoint = S3_ENDPOINT_URL[7:]  # Remove http://
        elif S3_ENDPOINT_URL.startswith("https://"):
            scheme = "https"
            endpoint = S3_ENDPOINT_URL[8:]  # Remove https://
        else:
            # Default to http for localhost
            scheme = "http"
            endpoint = S3_ENDPOINT_URL

        # Enable bucket creation and deletion for testing
        s3 = pafs.S3FileSystem(
            access_key=S3_ACCESS_KEY,
            secret_key=S3_SECRET_KEY,
            endpoint_override=endpoint,
            scheme=scheme,
            allow_bucket_creation=True,
            allow_bucket_deletion=True,
        )

        logging.info("✓ PyArrow S3FileSystem initialized successfully\n")
        return s3, scheme, endpoint
    except Exception:
        logging.exception("✗ Failed to initialize PyArrow S3FileSystem")
        return None, "", ""


def ensure_bucket_exists_boto3(scheme: str, endpoint: str) -> bool:
    """Ensure the test bucket exists using boto3."""
    if not HAS_BOTO3:
        logging.error("boto3 is required for bucket creation")
        return False

    try:
        # Create boto3 client
        endpoint_url = f"{scheme}://{endpoint}"
        s3_client = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=S3_ACCESS_KEY,
            aws_secret_access_key=S3_SECRET_KEY,
            region_name='us-east-1',
        )

        # Check if bucket exists
        try:
            s3_client.head_bucket(Bucket=BUCKET_NAME)
            logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
            return True
        except ClientError as e:
            error_code = e.response['Error']['Code']
            if error_code == '404':
                # Bucket doesn't exist, create it
                logging.info(f"Creating bucket: {BUCKET_NAME}")
                s3_client.create_bucket(Bucket=BUCKET_NAME)
                logging.info(f"✓ Bucket created: {BUCKET_NAME}")
                return True
            else:
                raise
    except Exception:
        logging.exception("✗ Failed to create/check bucket")
        return False


def ensure_bucket_exists(s3: pafs.S3FileSystem) -> bool:
    """Ensure the test bucket exists using PyArrow's native S3FileSystem."""
    try:
        # Check if the bucket exists by stat-ing it with get_file_info
        try:
            file_info = s3.get_file_info(BUCKET_NAME)
            if file_info.type == pafs.FileType.Directory:
                logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
                return True
        except OSError as e:
            # OSError typically means the bucket was not found, or there are
            # network/permission issues
            error_msg = str(e).lower()
            if "not found" in error_msg or "does not exist" in error_msg or "nosuchbucket" in error_msg:
                logging.debug(f"Bucket '{BUCKET_NAME}' not found, will attempt creation: {e}")
            else:
                # Log other OSErrors (network, auth, etc.) for debugging
                logging.debug(f"Error checking bucket '{BUCKET_NAME}', will attempt creation anyway: {type(e).__name__}: {e}")
        except Exception as e:
            # Catch any other unexpected exceptions and log them
            logging.debug(f"Unexpected error checking bucket '{BUCKET_NAME}', will attempt creation: {type(e).__name__}: {e}")

        # Try to create the bucket
        logging.info(f"Creating bucket: {BUCKET_NAME}")
        s3.create_dir(BUCKET_NAME)
        logging.info(f"✓ Bucket created: {BUCKET_NAME}")
        return True
    except Exception:
        logging.exception(f"✗ Failed to create/check bucket '{BUCKET_NAME}' with PyArrow")
        return False


def test_write_and_read(s3: pafs.S3FileSystem, test_name: str, num_rows: int) -> tuple[bool, str]:
    """Test writing and reading a Parquet dataset using PyArrow's native S3 filesystem."""
    try:
        table = create_sample_table(num_rows)

        # Write using pads.write_dataset
        filename = f"{BUCKET_NAME}/{TEST_DIR}/{test_name}/data.parquet"
        logging.info(f"  Writing {num_rows:,} rows to {filename}...")
        pads.write_dataset(
            table,
            filename,
            filesystem=s3,
            format="parquet",
        )
        logging.info("  ✓ Write completed")

        # Test Method 1: Read with pq.read_table
        logging.info("  Reading with pq.read_table...")
        table_read = pq.read_table(filename, filesystem=s3)
        if table_read.num_rows != num_rows:
            return False, f"pq.read_table: Row count mismatch (expected {num_rows}, got {table_read.num_rows})"

        # Check schema first
        if not table_read.schema.equals(table.schema):
            return False, f"pq.read_table: Schema mismatch (expected {table.schema}, got {table_read.schema})"

        # Sort both tables by 'id' column before comparison to handle potential row order differences
        table_sorted = table.sort_by([('id', 'ascending')])
        table_read_sorted = table_read.sort_by([('id', 'ascending')])

        if not table_read_sorted.equals(table_sorted):
            # Provide more detailed error information
            error_details = []
            for col_name in table.column_names:
                col_original = table_sorted.column(col_name)
                col_read = table_read_sorted.column(col_name)
                if not col_original.equals(col_read):
                    error_details.append(f"column '{col_name}' differs")
            return False, f"pq.read_table: Table contents mismatch ({', '.join(error_details)})"
        logging.info(f"  ✓ pq.read_table: {table_read.num_rows:,} rows")

        # Test Method 2: Read with pq.ParquetDataset
        logging.info("  Reading with pq.ParquetDataset...")
        dataset = pq.ParquetDataset(filename, filesystem=s3)
        table_dataset = dataset.read()
        if table_dataset.num_rows != num_rows:
            return False, f"pq.ParquetDataset: Row count mismatch (expected {num_rows}, got {table_dataset.num_rows})"

        # Sort before comparison
        table_dataset_sorted = table_dataset.sort_by([('id', 'ascending')])
        if not table_dataset_sorted.equals(table_sorted):
            error_details = []
            for col_name in table.column_names:
                col_original = table_sorted.column(col_name)
                col_read = table_dataset_sorted.column(col_name)
                if not col_original.equals(col_read):
                    error_details.append(f"column '{col_name}' differs")
            return False, f"pq.ParquetDataset: Table contents mismatch ({', '.join(error_details)})"
        logging.info(f"  ✓ pq.ParquetDataset: {table_dataset.num_rows:,} rows")

        # Test Method 3: Read with pads.dataset
        logging.info("  Reading with pads.dataset...")
        dataset_pads = pads.dataset(filename, filesystem=s3)
        table_pads = dataset_pads.to_table()
        if table_pads.num_rows != num_rows:
            return False, f"pads.dataset: Row count mismatch (expected {num_rows}, got {table_pads.num_rows})"

        # Sort before comparison
        table_pads_sorted = table_pads.sort_by([('id', 'ascending')])
        if not table_pads_sorted.equals(table_sorted):
            error_details = []
            for col_name in table.column_names:
                col_original = table_sorted.column(col_name)
                col_read = table_pads_sorted.column(col_name)
                if not col_original.equals(col_read):
                    error_details.append(f"column '{col_name}' differs")
            return False, f"pads.dataset: Table contents mismatch ({', '.join(error_details)})"
        logging.info(f"  ✓ pads.dataset: {table_pads.num_rows:,} rows")

        return True, "All read methods passed"

    except Exception as exc:
        logging.exception("  ✗ Test failed")
        return False, f"{type(exc).__name__}: {exc}"


def cleanup_test_files(s3: pafs.S3FileSystem) -> None:
    """Clean up test files from S3.

    Note: We cannot use s3.delete_dir() directly because SeaweedFS uses implicit
    directories (path prefixes without physical directory objects). PyArrow's
    delete_dir() attempts to delete the directory marker itself, which fails with
    "INTERNAL_FAILURE" on SeaweedFS. Instead, we list and delete files individually,
    letting implicit directories disappear automatically.
    """
    try:
        test_path = f"{BUCKET_NAME}/{TEST_DIR}"
        logging.info(f"Cleaning up test directory: {test_path}")

        # List and delete files individually to handle implicit directories
        try:
            file_selector = pafs.FileSelector(test_path, recursive=True)
            files = s3.get_file_info(file_selector)

            # Delete files first (not directories)
            for file_info in files:
                if file_info.type == pafs.FileType.File:
                    s3.delete_file(file_info.path)
                    logging.debug(f"  Deleted file: {file_info.path}")

            logging.info("✓ Test directory cleaned up")
        except OSError as e:
            # Handle the case where the path doesn't exist or is inaccessible
            if "does not exist" in str(e).lower() or "not found" in str(e).lower():
                logging.info("✓ Test directory already clean or doesn't exist")
            else:
                raise
    except Exception:
        logging.exception("Failed to cleanup test directory")


def main():
    """Run all tests with PyArrow's native S3 filesystem."""
    print("=" * 80)
    print("PyArrow Native S3 Filesystem Tests for SeaweedFS")
    print("Testing Parquet Files with Multiple Row Groups")
    if TEST_QUICK:
        print("*** QUICK TEST MODE - Small files only ***")
    print("=" * 80 + "\n")

    print("Configuration:")
    print(f"  S3 Endpoint: {S3_ENDPOINT_URL}")
    print(f"  Access Key: {S3_ACCESS_KEY}")
    print(f"  Bucket: {BUCKET_NAME}")
    print(f"  Test Directory: {TEST_DIR}")
    print(f"  Quick Mode: {'Yes (small files only)' if TEST_QUICK else 'No (all file sizes)'}")
    print(f"  PyArrow Version: {pa.__version__}")
    print()

    # Initialize S3 filesystem
    s3, scheme, endpoint = init_s3_filesystem()
    if s3 is None:
        print("Cannot proceed without S3 connection")
        return 1

    # Ensure bucket exists - try PyArrow first, fall back to boto3
    bucket_created = ensure_bucket_exists(s3)
    if not bucket_created:
        logging.info("Trying to create bucket with boto3...")
        bucket_created = ensure_bucket_exists_boto3(scheme, endpoint)

    if not bucket_created:
        print("Cannot proceed without bucket")
        return 1

    results = []

    # Test all file sizes
    for size_name, num_rows in TEST_SIZES.items():
        print(f"\n{'='*80}")
        print(f"Testing with {size_name} files ({num_rows:,} rows)")
        print(f"{'='*80}\n")

        test_name = f"{size_name}_test"
        success, message = test_write_and_read(s3, test_name, num_rows)
        results.append((test_name, success, message))

        status = "✓ PASS" if success else "✗ FAIL"
        print(f"\n{status}: {message}\n")

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    passed = sum(1 for _, success, _ in results if success)
    total = len(results)
    print(f"\nTotal: {passed}/{total} passed\n")

    for test_name, success, message in results:
        status = "✓" if success else "✗"
        print(f"  {status} {test_name}: {message}")

    print("\n" + "=" * 80)
    if passed == total:
        print("✓ ALL TESTS PASSED!")
    else:
        print(f"✗ {total - passed} test(s) failed")

    print("=" * 80 + "\n")

    # Cleanup
    cleanup_test_files(s3)

    return 0 if passed == total else 1


if __name__ == "__main__":
    sys.exit(main())