4 changed files with 340 additions and 3 deletions
11   .github/workflows/s3-parquet-tests.yml
57   test/s3/parquet/Makefile
14   test/s3/parquet/README.md
261  test/s3/parquet/test_sse_s3_compatibility.py

@@ -0,0 +1,261 @@
#!/usr/bin/env python3
"""
Test script for SSE-S3 compatibility with the PyArrow native S3 filesystem.

This test specifically targets the SSE-S3 multipart upload bug where
SeaweedFS panics with "bad IV length" when reading multipart uploads
that were encrypted with bucket-default SSE-S3.

Requirements:
    - pyarrow>=10.0.0
    - boto3>=1.28.0

Environment Variables:
    S3_ENDPOINT_URL: S3 endpoint (default: localhost:8333)
    S3_ACCESS_KEY: S3 access key (default: some_access_key1)
    S3_SECRET_KEY: S3 secret key (default: some_secret_key1)
    BUCKET_NAME: S3 bucket name (default: test-parquet-bucket)

Usage:
    # Start SeaweedFS with SSE-S3 enabled
    make start-seaweedfs-ci ENABLE_SSE_S3=true

    # Run the test
    python3 test_sse_s3_compatibility.py
"""

import logging
import os
import secrets
import sys
from typing import Optional

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq

logging.basicConfig(level=logging.INFO, format="%(message)s")

try:
    import boto3
    from botocore.exceptions import ClientError
except ImportError:
    logging.error("boto3 is required for this test")
    sys.exit(1)

# Configuration
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.environ.get("BUCKET_NAME", "test-parquet-bucket")

TEST_RUN_ID = secrets.token_hex(8)
TEST_DIR = f"sse-s3-tests/{TEST_RUN_ID}"

# Test sizes designed to trigger multipart uploads
# PyArrow typically uses 5MB chunks, so these sizes should trigger multipart
TEST_SIZES = {
    "tiny": 10,             # Single part
    "small": 1_000,         # Single part
    "medium": 50_000,       # Single part (~1.5MB)
    "large": 200_000,       # Multiple parts (~6MB)
    "very_large": 500_000,  # Multiple parts (~15MB)
}
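
# Optional sanity check (a sketch, not part of the original test): on AWS-style
# S3 servers a multipart object's ETag typically ends in "-<part count>", so a
# test could inspect the uploaded object to confirm multipart was actually used.
# SeaweedFS may format ETags differently, so treat this as a heuristic only.
def looks_multipart(s3_client, key: str) -> bool:
    """Best-effort guess at whether `key` was uploaded via multipart."""
    etag = s3_client.head_object(Bucket=BUCKET_NAME, Key=key)["ETag"].strip('"')
    return "-" in etag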


def create_sample_table(num_rows: int = 5) -> pa.Table:
    """Create a sample PyArrow table for testing."""
    return pa.table(
        {
            "id": pa.array(range(num_rows), type=pa.int64()),
            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
        }
    )


def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
    """Initialize PyArrow's native S3 filesystem."""
    try:
        logging.info("Initializing PyArrow S3FileSystem...")

        # Determine scheme from endpoint
        if S3_ENDPOINT_URL.startswith("http://"):
            scheme = "http"
            endpoint = S3_ENDPOINT_URL[7:]
        elif S3_ENDPOINT_URL.startswith("https://"):
            scheme = "https"
            endpoint = S3_ENDPOINT_URL[8:]
        else:
            scheme = "http"
            endpoint = S3_ENDPOINT_URL

        s3 = pafs.S3FileSystem(
            access_key=S3_ACCESS_KEY,
            secret_key=S3_SECRET_KEY,
            endpoint_override=endpoint,
            scheme=scheme,
            allow_bucket_creation=True,
            allow_bucket_deletion=True,
        )

        logging.info("✓ PyArrow S3FileSystem initialized\n")
        return s3, scheme, endpoint
    except Exception:
        logging.exception("✗ Failed to initialize PyArrow S3FileSystem")
        return None, "", ""


def ensure_bucket_exists(scheme: str, endpoint: str) -> bool:
    """Ensure the test bucket exists using boto3."""
    try:
        endpoint_url = f"{scheme}://{endpoint}"
        s3_client = boto3.client(
            "s3",
            endpoint_url=endpoint_url,
            aws_access_key_id=S3_ACCESS_KEY,
            aws_secret_access_key=S3_SECRET_KEY,
            region_name="us-east-1",
        )

        try:
            s3_client.head_bucket(Bucket=BUCKET_NAME)
            logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
        except ClientError as e:
            error_code = e.response["Error"]["Code"]
            if error_code == "404":
                logging.info(f"Creating bucket: {BUCKET_NAME}")
                s3_client.create_bucket(Bucket=BUCKET_NAME)
                logging.info(f"✓ Bucket created: {BUCKET_NAME}")
            else:
                # Anything other than "not found" (e.g. 403) is a real error.
                raise

        # Note: SeaweedFS doesn't support the GetBucketEncryption API,
        # so we can't verify via the API that SSE-S3 is enabled.
        # We assume it's configured correctly in the s3.json config file.
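        # For reference (hypothetical here, since SeaweedFS lacks the API), the
        # boto3 check on a server that supports it would look roughly like:
        #
        #   enc = s3_client.get_bucket_encryption(Bucket=BUCKET_NAME)
        #   rules = enc["ServerSideEncryptionConfiguration"]["Rules"]
        #   algo = rules[0]["ApplyServerSideEncryptionByDefault"]["SSEAlgorithm"]
        #   assert algo == "AES256"  # AES256 corresponds to SSE-S3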
        logging.info("✓ Assuming SSE-S3 is configured in s3.json")
        return True

    except Exception:
        logging.exception("✗ Failed to check bucket")
        return False


def test_write_read_with_sse(
    s3: pafs.S3FileSystem,
    test_name: str,
    num_rows: int,
) -> tuple[bool, str, int]:
    """Test writing and reading with SSE-S3 encryption."""
    try:
        table = create_sample_table(num_rows)
        filename = f"{BUCKET_NAME}/{TEST_DIR}/{test_name}/data.parquet"

        logging.info(f"  Writing {num_rows:,} rows...")
        pads.write_dataset(
            table,
            filename,
            filesystem=s3,
            format="parquet",
        )
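        # Note: write_dataset() treats `filename` as a directory and writes one
        # or more part files (by default named like part-0.parquet) inside it;
        # read_table() below reads that directory back as a single dataset.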

        logging.info("  Reading back...")
        table_read = pq.read_table(filename, filesystem=s3)

        if table_read.num_rows != num_rows:
            return False, f"Row count mismatch: {table_read.num_rows} != {num_rows}", 0

        return True, "Success", table_read.num_rows

    except Exception as e:
        error_msg = f"{type(e).__name__}: {e}"
        logging.error(f"  ✗ Failed: {error_msg}")
        return False, error_msg, 0


def main():
    """Run SSE-S3 compatibility tests."""
    print("=" * 80)
    print("SSE-S3 Compatibility Tests for PyArrow Native S3")
    print("Testing Multipart Upload Encryption")
    print("=" * 80 + "\n")

    print("Configuration:")
    print(f"  S3 Endpoint: {S3_ENDPOINT_URL}")
    print(f"  Bucket: {BUCKET_NAME}")
    print(f"  Test Directory: {TEST_DIR}")
    print(f"  PyArrow Version: {pa.__version__}")
    print()

    # Initialize
    s3, scheme, endpoint = init_s3_filesystem()
    if s3 is None:
        print("Cannot proceed without S3 connection")
        return 1

    # Ensure the bucket exists (SSE-S3 itself can't be verified via the API)
    if not ensure_bucket_exists(scheme, endpoint):
        print("\n⚠ WARNING: Could not verify or create the test bucket!")
        print("This test requires SSE-S3 encryption to be enabled on the bucket.")
        print("Please start SeaweedFS with: make start-seaweedfs-ci ENABLE_SSE_S3=true")
        return 1

    print()
    results = []

    # Test all sizes
    for size_name, num_rows in TEST_SIZES.items():
        print(f"\n{'=' * 80}")
        print(f"Testing {size_name} dataset ({num_rows:,} rows)")
        print(f"{'=' * 80}")

        success, message, rows_read = test_write_read_with_sse(
            s3, size_name, num_rows
        )
        results.append((size_name, num_rows, success, message, rows_read))

        if success:
            print(f"  ✓ SUCCESS: Read {rows_read:,} rows")
        else:
            print(f"  ✗ FAILED: {message}")

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)

    passed = sum(1 for _, _, success, _, _ in results if success)
    total = len(results)
    print(f"\nTotal: {passed}/{total} tests passed\n")

    print(f"{'Size':<15} {'Rows':>10} {'Status':<10} {'Message':<40}")
    print("-" * 80)
    for size_name, num_rows, success, message, rows_read in results:
        status = "✓ PASS" if success else "✗ FAIL"
        print(f"{size_name:<15} {num_rows:>10,} {status:<10} {message[:40]}")

    print("\n" + "=" * 80)
    if passed == total:
        print("✓ ALL TESTS PASSED WITH SSE-S3!")
        print("\nThis means:")
        print("  - SSE-S3 encryption is working correctly")
        print("  - PyArrow native S3 filesystem is compatible")
        print("  - Multipart uploads are handled properly")
    else:
        print(f"✗ {total - passed} test(s) failed")
        print("\nPossible issues:")
        print("  - SSE-S3 multipart upload bug with empty IV")
        print("  - Encryption/decryption mismatch")
        print("  - File corruption during upload")

    print("=" * 80 + "\n")

    return 0 if passed == total else 1


if __name__ == "__main__":
    sys.exit(main())