S3: Add tests for PyArrow with native S3 filesystem (#7508)
* PyArrow native S3 filesystem
* add sse-s3 tests
* update
* minor
* ENABLE_SSE_S3
* Update test_pyarrow_native_s3.py
* clean up
* refactoring
* Update test_pyarrow_native_s3.py
committed by GitHub
7 changed files with 1008 additions and 5 deletions
  22  .github/workflows/s3-parquet-tests.yml
  92  test/s3/parquet/Makefile
  87  test/s3/parquet/README.md
 134  test/s3/parquet/example_pyarrow_native.py
  41  test/s3/parquet/parquet_test_utils.py
 383  test/s3/parquet/test_pyarrow_native_s3.py
 254  test/s3/parquet/test_sse_s3_compatibility.py
test/s3/parquet/example_pyarrow_native.py
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
# /// script
# dependencies = [
#     "pyarrow>=22",
#     "boto3>=1.28.0",
# ]
# ///

"""
Simple example of using PyArrow's native S3 filesystem with SeaweedFS.

This is a minimal example demonstrating how to write and read Parquet files
using PyArrow's built-in S3FileSystem without any additional dependencies
like s3fs.

Usage:
    # Set environment variables
    export S3_ENDPOINT_URL=localhost:8333
    export S3_ACCESS_KEY=some_access_key1
    export S3_SECRET_KEY=some_secret_key1
    export BUCKET_NAME=test-parquet-bucket

    # Run the script
    python3 example_pyarrow_native.py

    # Or run with uv (if available)
    uv run example_pyarrow_native.py
"""

import os
import secrets

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq

from parquet_test_utils import create_sample_table

# Configuration
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.getenv("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.getenv("S3_SECRET_KEY", "some_secret_key1")

# Determine scheme from endpoint
if S3_ENDPOINT_URL.startswith("http://"):
    scheme = "http"
    endpoint = S3_ENDPOINT_URL[7:]
elif S3_ENDPOINT_URL.startswith("https://"):
    scheme = "https"
    endpoint = S3_ENDPOINT_URL[8:]
else:
    scheme = "http"  # Default to http for localhost
    endpoint = S3_ENDPOINT_URL

print(f"Connecting to S3 endpoint: {scheme}://{endpoint}")

# Initialize PyArrow's NATIVE S3 filesystem
s3 = pafs.S3FileSystem(
    access_key=S3_ACCESS_KEY,
    secret_key=S3_SECRET_KEY,
    endpoint_override=endpoint,
    scheme=scheme,
    allow_bucket_creation=True,
    allow_bucket_deletion=True,
)

print("✓ Connected to S3 endpoint")


# Create bucket if needed (using boto3)
try:
    import boto3
    from botocore.exceptions import ClientError

    s3_client = boto3.client(
        's3',
        endpoint_url=f"{scheme}://{endpoint}",
        aws_access_key_id=S3_ACCESS_KEY,
        aws_secret_access_key=S3_SECRET_KEY,
        region_name='us-east-1',
    )

    try:
        s3_client.head_bucket(Bucket=BUCKET_NAME)
        print(f"✓ Bucket exists: {BUCKET_NAME}")
    except ClientError as e:
        if e.response['Error']['Code'] == '404':
            print(f"Creating bucket: {BUCKET_NAME}")
            s3_client.create_bucket(Bucket=BUCKET_NAME)
            print(f"✓ Bucket created: {BUCKET_NAME}")
        else:
            raise
except ImportError:
    print("Warning: boto3 not available, assuming bucket exists")

# Generate a unique filename
filename = f"{BUCKET_NAME}/dataset-{secrets.token_hex(8)}/test.parquet"

print(f"\nWriting Parquet dataset to: {filename}")

# Write dataset
table = create_sample_table(200_000)
pads.write_dataset(
    table,
    filename,
    filesystem=s3,
    format="parquet",
)

print(f"✓ Wrote {table.num_rows:,} rows")

# Read with pq.read_table
print("\nReading with pq.read_table...")
table_read = pq.read_table(filename, filesystem=s3)
print(f"✓ Read {table_read.num_rows:,} rows")

# Read with pq.ParquetDataset
print("\nReading with pq.ParquetDataset...")
dataset = pq.ParquetDataset(filename, filesystem=s3)
table_dataset = dataset.read()
print(f"✓ Read {table_dataset.num_rows:,} rows")

# Read with pads.dataset
print("\nReading with pads.dataset...")
dataset_pads = pads.dataset(filename, filesystem=s3)
table_pads = dataset_pads.to_table()
print(f"✓ Read {table_pads.num_rows:,} rows")

print("\n✅ All operations completed successfully!")
print(f"\nFile written to: {filename}")
print("You can verify the file using the SeaweedFS S3 API or weed shell")
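Editor's note, not part of the commit: besides the S3 API or weed shell mentioned at the end of the example, the written objects can be listed back with the same native filesystem. A minimal sketch, assuming the default SeaweedFS endpoint and credentials used above:

# Sketch: list the Parquet fragments the example wrote, using PyArrow's native S3 FS.
# Endpoint and credentials mirror the defaults above.
import pyarrow.fs as pafs

s3 = pafs.S3FileSystem(
    access_key="some_access_key1",
    secret_key="some_secret_key1",
    endpoint_override="localhost:8333",
    scheme="http",
)
selector = pafs.FileSelector("test-parquet-bucket", recursive=True, allow_not_found=True)
for info in s3.get_file_info(selector):
    if info.type == pafs.FileType.File:
        print(f"{info.path} ({info.size:,} bytes)")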
test/s3/parquet/parquet_test_utils.py
@@ -0,0 +1,41 @@
"""
Shared utility functions for PyArrow Parquet tests.

This module provides common test utilities used across multiple test scripts
to avoid code duplication and ensure consistency.
"""

import pyarrow as pa


def create_sample_table(num_rows: int = 5) -> pa.Table:
    """Create a sample PyArrow table for testing.

    Args:
        num_rows: Number of rows to generate (default: 5)

    Returns:
        PyArrow Table with test data containing:
        - id: int64 sequential IDs (0 to num_rows-1)
        - name: string user names (user_0, user_1, ...)
        - value: float64 values (id * 1.5)
        - flag: bool alternating True/False based on even/odd id

    Example:
        >>> table = create_sample_table(3)
        >>> print(table)
        pyarrow.Table
        id: int64
        name: string
        value: double
        flag: bool
    """
    return pa.table(
        {
            "id": pa.array(range(num_rows), type=pa.int64()),
            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
        }
    )
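Editor's note, not part of the commit: the helper above can be sanity-checked locally without any S3 setup; a quick sketch of what it produces:

# Sketch: exercise create_sample_table without any S3 dependency.
from parquet_test_utils import create_sample_table

table = create_sample_table(4)
print(table.num_rows)                      # 4
print(table.column_names)                  # ['id', 'name', 'value', 'flag']
print(table.column("value").to_pylist())   # [0.0, 1.5, 3.0, 4.5]
print(table.column("flag").to_pylist())    # [True, False, True, False]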
test/s3/parquet/test_pyarrow_native_s3.py
@@ -0,0 +1,383 @@
#!/usr/bin/env python3
"""
Test script for PyArrow's NATIVE S3 filesystem with SeaweedFS.

This test uses PyArrow's built-in S3FileSystem (pyarrow.fs.S3FileSystem)
instead of s3fs, providing a pure PyArrow solution for reading and writing
Parquet files to S3-compatible storage.

Requirements:
    - pyarrow>=10.0.0

Environment Variables:
    S3_ENDPOINT_URL: S3 endpoint (default: localhost:8333)
    S3_ACCESS_KEY: S3 access key (default: some_access_key1)
    S3_SECRET_KEY: S3 secret key (default: some_secret_key1)
    BUCKET_NAME: S3 bucket name (default: test-parquet-bucket)
    TEST_QUICK: Run only small/quick tests (default: 0, set to 1 for quick mode)

Usage:
    # Run with default environment variables
    python3 test_pyarrow_native_s3.py

    # Run with custom environment variables
    S3_ENDPOINT_URL=localhost:8333 \
    S3_ACCESS_KEY=mykey \
    S3_SECRET_KEY=mysecret \
    BUCKET_NAME=mybucket \
    python3 test_pyarrow_native_s3.py
"""

import os
import secrets
import sys
import logging
from typing import Optional

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq

try:
    import boto3
    from botocore.exceptions import ClientError
    HAS_BOTO3 = True
except ImportError:
    HAS_BOTO3 = False

from parquet_test_utils import create_sample_table

logging.basicConfig(level=logging.INFO, format="%(message)s")

# Configuration from environment variables with defaults
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
TEST_QUICK = os.getenv("TEST_QUICK", "0") == "1"

# Create randomized test directory
TEST_RUN_ID = secrets.token_hex(8)
TEST_DIR = f"parquet-native-tests/{TEST_RUN_ID}"

# Test file sizes
TEST_SIZES = {
    "small": 5,
    "large": 200_000,  # This will create multiple row groups
}

# Filter to only small tests if quick mode is enabled
if TEST_QUICK:
    TEST_SIZES = {"small": TEST_SIZES["small"]}
    logging.info("Quick test mode enabled - running only small tests")


def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
    """Initialize PyArrow's native S3 filesystem.

    Returns:
        tuple: (S3FileSystem instance, scheme, endpoint)
    """
    try:
        logging.info("Initializing PyArrow S3FileSystem...")
        logging.info(f" Endpoint: {S3_ENDPOINT_URL}")
        logging.info(f" Bucket: {BUCKET_NAME}")

        # Determine scheme from endpoint
        if S3_ENDPOINT_URL.startswith("http://"):
            scheme = "http"
            endpoint = S3_ENDPOINT_URL[7:]  # Remove http://
        elif S3_ENDPOINT_URL.startswith("https://"):
            scheme = "https"
            endpoint = S3_ENDPOINT_URL[8:]  # Remove https://
        else:
            # Default to http for localhost
            scheme = "http"
            endpoint = S3_ENDPOINT_URL

        # Enable bucket creation and deletion for testing
        s3 = pafs.S3FileSystem(
            access_key=S3_ACCESS_KEY,
            secret_key=S3_SECRET_KEY,
            endpoint_override=endpoint,
            scheme=scheme,
            allow_bucket_creation=True,
            allow_bucket_deletion=True,
        )

        logging.info("✓ PyArrow S3FileSystem initialized successfully\n")
        return s3, scheme, endpoint
    except Exception:
        logging.exception("✗ Failed to initialize PyArrow S3FileSystem")
        return None, "", ""


def ensure_bucket_exists_boto3(scheme: str, endpoint: str) -> bool:
    """Ensure the test bucket exists using boto3."""
    if not HAS_BOTO3:
        logging.error("boto3 is required for bucket creation")
        return False

    try:
        # Create boto3 client
        endpoint_url = f"{scheme}://{endpoint}"
        s3_client = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=S3_ACCESS_KEY,
            aws_secret_access_key=S3_SECRET_KEY,
            region_name='us-east-1',
        )

        # Check if bucket exists
        try:
            s3_client.head_bucket(Bucket=BUCKET_NAME)
            logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
            return True
        except ClientError as e:
            error_code = e.response['Error']['Code']
            if error_code == '404':
                # Bucket doesn't exist, create it
                logging.info(f"Creating bucket: {BUCKET_NAME}")
                s3_client.create_bucket(Bucket=BUCKET_NAME)
                logging.info(f"✓ Bucket created: {BUCKET_NAME}")
                return True
            else:
                raise
    except Exception:
        logging.exception("✗ Failed to create/check bucket")
        return False


def ensure_bucket_exists(s3: pafs.S3FileSystem) -> bool:
    """Ensure the test bucket exists using PyArrow's native S3FileSystem."""
    try:
        # Check if bucket exists by trying to list it
        try:
            file_info = s3.get_file_info(BUCKET_NAME)
            if file_info.type == pafs.FileType.Directory:
                logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
                return True
        except OSError as e:
            # OSError typically means bucket not found or network/permission issues
            error_msg = str(e).lower()
            if "not found" in error_msg or "does not exist" in error_msg or "nosuchbucket" in error_msg:
                logging.debug(f"Bucket '{BUCKET_NAME}' not found, will attempt creation: {e}")
            else:
                # Log other OSErrors (network, auth, etc.) for debugging
                logging.debug(f"Error checking bucket '{BUCKET_NAME}', will attempt creation anyway: {type(e).__name__}: {e}")
        except Exception as e:
            # Catch any other unexpected exceptions and log them
            logging.debug(f"Unexpected error checking bucket '{BUCKET_NAME}', will attempt creation: {type(e).__name__}: {e}")

        # Try to create the bucket
        logging.info(f"Creating bucket: {BUCKET_NAME}")
        s3.create_dir(BUCKET_NAME)
        logging.info(f"✓ Bucket created: {BUCKET_NAME}")
        return True
    except Exception:
        logging.exception(f"✗ Failed to create/check bucket '{BUCKET_NAME}' with PyArrow")
        return False


def test_write_and_read(s3: pafs.S3FileSystem, test_name: str, num_rows: int) -> tuple[bool, str]:
    """Test writing and reading a Parquet dataset using PyArrow's native S3 filesystem."""
    try:
        table = create_sample_table(num_rows)

        # Write using pads.write_dataset
        filename = f"{BUCKET_NAME}/{TEST_DIR}/{test_name}/data.parquet"
        logging.info(f" Writing {num_rows:,} rows to {filename}...")

        pads.write_dataset(
            table,
            filename,
            filesystem=s3,
            format="parquet",
        )
        logging.info(" ✓ Write completed")

        # Test Method 1: Read with pq.read_table
        logging.info(" Reading with pq.read_table...")
        table_read = pq.read_table(filename, filesystem=s3)
        if table_read.num_rows != num_rows:
            return False, f"pq.read_table: Row count mismatch (expected {num_rows}, got {table_read.num_rows})"

        # Check schema first
        if not table_read.schema.equals(table.schema):
            return False, f"pq.read_table: Schema mismatch (expected {table.schema}, got {table_read.schema})"

        # Sort both tables by 'id' column before comparison to handle potential row order differences
        table_sorted = table.sort_by([('id', 'ascending')])
        table_read_sorted = table_read.sort_by([('id', 'ascending')])

        if not table_read_sorted.equals(table_sorted):
            # Provide more detailed error information
            error_details = []
            for col_name in table.column_names:
                col_original = table_sorted.column(col_name)
                col_read = table_read_sorted.column(col_name)
                if not col_original.equals(col_read):
                    error_details.append(f"column '{col_name}' differs")
            return False, f"pq.read_table: Table contents mismatch ({', '.join(error_details)})"
        logging.info(f" ✓ pq.read_table: {table_read.num_rows:,} rows")

        # Test Method 2: Read with pq.ParquetDataset
        logging.info(" Reading with pq.ParquetDataset...")
        dataset = pq.ParquetDataset(filename, filesystem=s3)
        table_dataset = dataset.read()
        if table_dataset.num_rows != num_rows:
            return False, f"pq.ParquetDataset: Row count mismatch (expected {num_rows}, got {table_dataset.num_rows})"

        # Sort before comparison
        table_dataset_sorted = table_dataset.sort_by([('id', 'ascending')])
        if not table_dataset_sorted.equals(table_sorted):
            error_details = []
            for col_name in table.column_names:
                col_original = table_sorted.column(col_name)
                col_read = table_dataset_sorted.column(col_name)
                if not col_original.equals(col_read):
                    error_details.append(f"column '{col_name}' differs")
            return False, f"pq.ParquetDataset: Table contents mismatch ({', '.join(error_details)})"
        logging.info(f" ✓ pq.ParquetDataset: {table_dataset.num_rows:,} rows")

        # Test Method 3: Read with pads.dataset
        logging.info(" Reading with pads.dataset...")
        dataset_pads = pads.dataset(filename, filesystem=s3)
        table_pads = dataset_pads.to_table()
        if table_pads.num_rows != num_rows:
            return False, f"pads.dataset: Row count mismatch (expected {num_rows}, got {table_pads.num_rows})"

        # Sort before comparison
        table_pads_sorted = table_pads.sort_by([('id', 'ascending')])
        if not table_pads_sorted.equals(table_sorted):
            error_details = []
            for col_name in table.column_names:
                col_original = table_sorted.column(col_name)
                col_read = table_pads_sorted.column(col_name)
                if not col_original.equals(col_read):
                    error_details.append(f"column '{col_name}' differs")
            return False, f"pads.dataset: Table contents mismatch ({', '.join(error_details)})"
        logging.info(f" ✓ pads.dataset: {table_pads.num_rows:,} rows")

        return True, "All read methods passed"

    except Exception as exc:
        logging.exception(" ✗ Test failed")
        return False, f"{type(exc).__name__}: {exc}"


def cleanup_test_files(s3: pafs.S3FileSystem) -> None:
    """Clean up test files from S3.

    Note: We cannot use s3.delete_dir() directly because SeaweedFS uses implicit
    directories (path prefixes without physical directory objects). PyArrow's
    delete_dir() attempts to delete the directory marker itself, which fails with
    "INTERNAL_FAILURE" on SeaweedFS. Instead, we list and delete files individually,
    letting implicit directories disappear automatically.
    """
    try:
        test_path = f"{BUCKET_NAME}/{TEST_DIR}"
        logging.info(f"Cleaning up test directory: {test_path}")

        # List and delete files individually to handle implicit directories
        try:
            file_selector = pafs.FileSelector(test_path, recursive=True)
            files = s3.get_file_info(file_selector)

            # Delete files first (not directories)
            for file_info in files:
                if file_info.type == pafs.FileType.File:
                    s3.delete_file(file_info.path)
                    logging.debug(f" Deleted file: {file_info.path}")

            logging.info("✓ Test directory cleaned up")
        except OSError as e:
            # Handle the case where the path doesn't exist or is inaccessible
            if "does not exist" in str(e).lower() or "not found" in str(e).lower():
                logging.info("✓ Test directory already clean or doesn't exist")
            else:
                raise
    except Exception:
        logging.exception("Failed to cleanup test directory")


def main():
    """Run all tests with PyArrow's native S3 filesystem."""
    print("=" * 80)
    print("PyArrow Native S3 Filesystem Tests for SeaweedFS")
    print("Testing Parquet Files with Multiple Row Groups")
    if TEST_QUICK:
        print("*** QUICK TEST MODE - Small files only ***")
    print("=" * 80 + "\n")

    print("Configuration:")
    print(f" S3 Endpoint: {S3_ENDPOINT_URL}")
    print(f" Access Key: {S3_ACCESS_KEY}")
    print(f" Bucket: {BUCKET_NAME}")
    print(f" Test Directory: {TEST_DIR}")
    print(f" Quick Mode: {'Yes (small files only)' if TEST_QUICK else 'No (all file sizes)'}")
    print(f" PyArrow Version: {pa.__version__}")
    print()

    # Initialize S3 filesystem
    s3, scheme, endpoint = init_s3_filesystem()
    if s3 is None:
        print("Cannot proceed without S3 connection")
        return 1

    # Ensure bucket exists - try PyArrow first, fall back to boto3
    bucket_created = ensure_bucket_exists(s3)
    if not bucket_created:
        logging.info("Trying to create bucket with boto3...")
        bucket_created = ensure_bucket_exists_boto3(scheme, endpoint)

    if not bucket_created:
        print("Cannot proceed without bucket")
        return 1

    results = []

    # Test all file sizes
    for size_name, num_rows in TEST_SIZES.items():
        print(f"\n{'='*80}")
        print(f"Testing with {size_name} files ({num_rows:,} rows)")
        print(f"{'='*80}\n")

        test_name = f"{size_name}_test"
        success, message = test_write_and_read(s3, test_name, num_rows)
        results.append((test_name, success, message))

        status = "✓ PASS" if success else "✗ FAIL"
        print(f"\n{status}: {message}\n")

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    passed = sum(1 for _, success, _ in results if success)
    total = len(results)
    print(f"\nTotal: {passed}/{total} passed\n")

    for test_name, success, message in results:
        status = "✓" if success else "✗"
        print(f" {status} {test_name}: {message}")

    print("\n" + "=" * 80)
    if passed == total:
        print("✓ ALL TESTS PASSED!")
    else:
        print(f"✗ {total - passed} test(s) failed")

    print("=" * 80 + "\n")

    # Cleanup
    cleanup_test_files(s3)

    return 0 if passed == total else 1


if __name__ == "__main__":
    sys.exit(main())
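Editor's note, not part of the commit: the TEST_QUICK mode documented in the docstring above can also be driven from a small Python wrapper rather than a shell export; a sketch, assuming the script sits in the current directory and the default endpoint is used:

# Sketch: run the test module in quick mode (equivalent to TEST_QUICK=1 python3 test_pyarrow_native_s3.py).
import os
import subprocess
import sys

env = dict(os.environ, TEST_QUICK="1", S3_ENDPOINT_URL="localhost:8333")
result = subprocess.run([sys.executable, "test_pyarrow_native_s3.py"], env=env)
sys.exit(result.returncode)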
test/s3/parquet/test_sse_s3_compatibility.py
@@ -0,0 +1,254 @@
#!/usr/bin/env python3
"""
Test script for SSE-S3 compatibility with PyArrow native S3 filesystem.

This test specifically targets the SSE-S3 multipart upload bug where
SeaweedFS panics with "bad IV length" when reading multipart uploads
that were encrypted with bucket-default SSE-S3.

Requirements:
    - pyarrow>=10.0.0
    - boto3>=1.28.0

Environment Variables:
    S3_ENDPOINT_URL: S3 endpoint (default: localhost:8333)
    S3_ACCESS_KEY: S3 access key (default: some_access_key1)
    S3_SECRET_KEY: S3 secret key (default: some_secret_key1)
    BUCKET_NAME: S3 bucket name (default: test-parquet-bucket)

Usage:
    # Start SeaweedFS with SSE-S3 enabled
    make start-seaweedfs-ci ENABLE_SSE_S3=true

    # Run the test
    python3 test_sse_s3_compatibility.py
"""

import os
import secrets
import sys
import logging
from typing import Optional

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq

try:
    import boto3
    from botocore.exceptions import ClientError
    HAS_BOTO3 = True
except ImportError:
    HAS_BOTO3 = False
    logging.exception("boto3 is required for this test")
    sys.exit(1)

from parquet_test_utils import create_sample_table

logging.basicConfig(level=logging.INFO, format="%(message)s")

# Configuration
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")

TEST_RUN_ID = secrets.token_hex(8)
TEST_DIR = f"sse-s3-tests/{TEST_RUN_ID}"

# Test sizes designed to trigger multipart uploads
# PyArrow typically uses 5MB chunks, so these sizes should trigger multipart
TEST_SIZES = {
    "tiny": 10,  # Single part
    "small": 1_000,  # Single part
    "medium": 50_000,  # Single part (~1.5MB)
    "large": 200_000,  # Multiple parts (~6MB)
    "very_large": 500_000,  # Multiple parts (~15MB)
}


def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
    """Initialize PyArrow's native S3 filesystem."""
    try:
        logging.info("Initializing PyArrow S3FileSystem...")

        # Determine scheme from endpoint
        if S3_ENDPOINT_URL.startswith("http://"):
            scheme = "http"
            endpoint = S3_ENDPOINT_URL[7:]
        elif S3_ENDPOINT_URL.startswith("https://"):
            scheme = "https"
            endpoint = S3_ENDPOINT_URL[8:]
        else:
            scheme = "http"
            endpoint = S3_ENDPOINT_URL

        s3 = pafs.S3FileSystem(
            access_key=S3_ACCESS_KEY,
            secret_key=S3_SECRET_KEY,
            endpoint_override=endpoint,
            scheme=scheme,
            allow_bucket_creation=True,
            allow_bucket_deletion=True,
        )

        logging.info("✓ PyArrow S3FileSystem initialized\n")
        return s3, scheme, endpoint
    except Exception:
        logging.exception("✗ Failed to initialize PyArrow S3FileSystem")
        return None, "", ""


def ensure_bucket_exists(scheme: str, endpoint: str) -> bool:
    """Ensure the test bucket exists using boto3."""
    try:
        endpoint_url = f"{scheme}://{endpoint}"
        s3_client = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=S3_ACCESS_KEY,
            aws_secret_access_key=S3_SECRET_KEY,
            region_name='us-east-1',
        )

        try:
            s3_client.head_bucket(Bucket=BUCKET_NAME)
            logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
        except ClientError as e:
            error_code = e.response['Error']['Code']
            if error_code == '404':
                logging.info(f"Creating bucket: {BUCKET_NAME}")
                s3_client.create_bucket(Bucket=BUCKET_NAME)
                logging.info(f"✓ Bucket created: {BUCKET_NAME}")
            else:
                logging.exception("✗ Failed to access bucket")
                return False

        # Note: SeaweedFS doesn't support GetBucketEncryption API
        # so we can't verify if SSE-S3 is enabled via API
        # We assume it's configured correctly in the s3.json config file
        logging.info("✓ Assuming SSE-S3 is configured in s3.json")
        return True

    except Exception:
        logging.exception("✗ Failed to check bucket")
        return False


def test_write_read_with_sse(
    s3: pafs.S3FileSystem,
    test_name: str,
    num_rows: int
) -> tuple[bool, str, int]:
    """Test writing and reading with SSE-S3 encryption."""
    try:
        table = create_sample_table(num_rows)
        filename = f"{BUCKET_NAME}/{TEST_DIR}/{test_name}/data.parquet"

        logging.info(f" Writing {num_rows:,} rows...")
        pads.write_dataset(
            table,
            filename,
            filesystem=s3,
            format="parquet",
        )

        logging.info(" Reading back...")
        table_read = pq.read_table(filename, filesystem=s3)

        if table_read.num_rows != num_rows:
            return False, f"Row count mismatch: {table_read.num_rows} != {num_rows}", 0

        return True, "Success", table_read.num_rows

    except Exception as e:
        error_msg = f"{type(e).__name__}: {e!s}"
        logging.exception(" ✗ Failed")
        return False, error_msg, 0


def main():
    """Run SSE-S3 compatibility tests."""
    print("=" * 80)
    print("SSE-S3 Compatibility Tests for PyArrow Native S3")
    print("Testing Multipart Upload Encryption")
    print("=" * 80 + "\n")

    print("Configuration:")
    print(f" S3 Endpoint: {S3_ENDPOINT_URL}")
    print(f" Bucket: {BUCKET_NAME}")
    print(f" Test Directory: {TEST_DIR}")
    print(f" PyArrow Version: {pa.__version__}")
    print()

    # Initialize
    s3, scheme, endpoint = init_s3_filesystem()
    if s3 is None:
        print("Cannot proceed without S3 connection")
        return 1

    # Check bucket and SSE-S3
    if not ensure_bucket_exists(scheme, endpoint):
        print("\n⚠ WARNING: Failed to access or create the test bucket!")
        print("This test requires a reachable bucket with SSE-S3 enabled.")
        print("Please ensure SeaweedFS is running with: make start-seaweedfs-ci ENABLE_SSE_S3=true")
        return 1

    print()
    results = []

    # Test all sizes
    for size_name, num_rows in TEST_SIZES.items():
        print(f"\n{'='*80}")
        print(f"Testing {size_name} dataset ({num_rows:,} rows)")
        print(f"{'='*80}")

        success, message, rows_read = test_write_read_with_sse(
            s3, size_name, num_rows
        )
        results.append((size_name, num_rows, success, message, rows_read))

        if success:
            print(f" ✓ SUCCESS: Read {rows_read:,} rows")
        else:
            print(f" ✗ FAILED: {message}")

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)

    passed = sum(1 for _, _, success, _, _ in results if success)
    total = len(results)
    print(f"\nTotal: {passed}/{total} tests passed\n")

    print(f"{'Size':<15} {'Rows':>10} {'Status':<10} {'Rows Read':>10} {'Message':<40}")
    print("-" * 90)
    for size_name, num_rows, success, message, rows_read in results:
        status = "✓ PASS" if success else "✗ FAIL"
        rows_str = f"{rows_read:,}" if success else "N/A"
        print(f"{size_name:<15} {num_rows:>10,} {status:<10} {rows_str:>10} {message[:40]}")

    print("\n" + "=" * 80)
    if passed == total:
        print("✓ ALL TESTS PASSED WITH SSE-S3!")
        print("\nThis means:")
        print(" - SSE-S3 encryption is working correctly")
        print(" - PyArrow native S3 filesystem is compatible")
        print(" - Multipart uploads are handled properly")
    else:
        print(f"✗ {total - passed} test(s) failed")
        print("\nPossible issues:")
        print(" - SSE-S3 multipart upload bug with empty IV")
        print(" - Encryption/decryption mismatch")
        print(" - File corruption during upload")

    print("=" * 80 + "\n")

    return 0 if passed == total else 1


if __name__ == "__main__":
    sys.exit(main())