4 changed files with 340 additions and 3 deletions
- 11 .github/workflows/s3-parquet-tests.yml
- 57 test/s3/parquet/Makefile
- 14 test/s3/parquet/README.md
- 261 test/s3/parquet/test_sse_s3_compatibility.py

@@ -0,0 +1,261 @@
#!/usr/bin/env python3
"""
Test script for SSE-S3 compatibility with PyArrow's native S3 filesystem.

This test specifically targets the SSE-S3 multipart upload bug where
SeaweedFS panics with "bad IV length" when reading multipart uploads
that were encrypted with bucket-default SSE-S3.

Requirements:
    - pyarrow>=10.0.0
    - boto3>=1.28.0

Environment Variables:
    S3_ENDPOINT_URL: S3 endpoint (default: localhost:8333)
    S3_ACCESS_KEY: S3 access key (default: some_access_key1)
    S3_SECRET_KEY: S3 secret key (default: some_secret_key1)
    BUCKET_NAME: S3 bucket name (default: test-parquet-bucket)

Usage:
    # Start SeaweedFS with SSE-S3 enabled
    make start-seaweedfs-ci ENABLE_SSE_S3=true

    # Run the test
    python3 test_sse_s3_compatibility.py
"""

import logging
import os
import secrets
import sys
from typing import Optional

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq

try:
    import boto3
    from botocore.exceptions import ClientError
except ImportError:
    logging.error("boto3 is required for this test")
    sys.exit(1)

logging.basicConfig(level=logging.INFO, format="%(message)s")

# Configuration
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.environ.get("BUCKET_NAME", "test-parquet-bucket")

TEST_RUN_ID = secrets.token_hex(8)
TEST_DIR = f"sse-s3-tests/{TEST_RUN_ID}"

# Test sizes designed to trigger multipart uploads.
# PyArrow typically uses 5MB chunks, so the larger sizes should trigger
# multipart (see the size-estimation sketch after create_sample_table).
TEST_SIZES = {
    "tiny": 10,  # Single part
    "small": 1_000,  # Single part
    "medium": 50_000,  # Single part (~1.5MB)
    "large": 200_000,  # Multiple parts (~6MB)
    "very_large": 500_000,  # Multiple parts (~15MB)
}


def create_sample_table(num_rows: int = 5) -> pa.Table:
    """Create a sample PyArrow table for testing."""
    return pa.table(
        {
            "id": pa.array(range(num_rows), type=pa.int64()),
            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
        }
    )
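

# A minimal sanity-check sketch, not wired into the test flow: it logs the
# in-memory Arrow size of each TEST_SIZES table so the single-part vs.
# multipart annotations above can be eyeballed against PyArrow's ~5MB part
# size. Assumption: the Arrow buffer size is only a rough proxy for the
# Parquet object size, since Parquet compression usually shrinks it.
def log_estimated_sizes() -> None:
    """Log the approximate in-memory size of each test table."""
    for size_name, num_rows in TEST_SIZES.items():
        nbytes = create_sample_table(num_rows).nbytes
        logging.info(f"{size_name}: {num_rows:,} rows ≈ {nbytes / 1_000_000:.1f}MB in memory")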


def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
    """Initialize PyArrow's native S3 filesystem."""
    try:
        logging.info("Initializing PyArrow S3FileSystem...")

        # Determine scheme from endpoint; default to http for a bare host:port
        if S3_ENDPOINT_URL.startswith("http://"):
            scheme = "http"
            endpoint = S3_ENDPOINT_URL[7:]
        elif S3_ENDPOINT_URL.startswith("https://"):
            scheme = "https"
            endpoint = S3_ENDPOINT_URL[8:]
        else:
            scheme = "http"
            endpoint = S3_ENDPOINT_URL

        s3 = pafs.S3FileSystem(
            access_key=S3_ACCESS_KEY,
            secret_key=S3_SECRET_KEY,
            endpoint_override=endpoint,
            scheme=scheme,
            allow_bucket_creation=True,
            allow_bucket_deletion=True,
        )

        logging.info("✓ PyArrow S3FileSystem initialized\n")
        return s3, scheme, endpoint
    except Exception:
        logging.exception("✗ Failed to initialize PyArrow S3FileSystem")
        return None, "", ""
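

# Optional hedged sketch: a cheap connectivity probe using PyArrow itself,
# independent of boto3. Assumption: listing the bucket root is inexpensive,
# and any filesystem-level failure (bad endpoint, bad credentials, missing
# bucket) surfaces as an OSError from Arrow.
def check_s3_connection(s3: pafs.S3FileSystem) -> bool:
    """Return True if the S3 endpoint answers a simple listing request."""
    try:
        s3.get_file_info(pafs.FileSelector(BUCKET_NAME, recursive=False))
        return True
    except OSError:
        return False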


def ensure_bucket_exists(scheme: str, endpoint: str) -> bool:
    """Ensure the test bucket exists, using boto3."""
    try:
        endpoint_url = f"{scheme}://{endpoint}"
        s3_client = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=S3_ACCESS_KEY,
            aws_secret_access_key=S3_SECRET_KEY,
            region_name='us-east-1',
        )

        try:
            s3_client.head_bucket(Bucket=BUCKET_NAME)
            logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
        except ClientError as e:
            error_code = e.response['Error']['Code']
            if error_code == '404':
                logging.info(f"Creating bucket: {BUCKET_NAME}")
                s3_client.create_bucket(Bucket=BUCKET_NAME)
                logging.info(f"✓ Bucket created: {BUCKET_NAME}")
            else:
                raise

        # Note: SeaweedFS doesn't support the GetBucketEncryption API,
        # so we can't verify via the API whether SSE-S3 is enabled.
        # We assume it's configured correctly in the s3.json config file.
        logging.info("✓ Assuming SSE-S3 is configured in s3.json")
        return True

    except Exception:
        logging.exception("✗ Failed to check bucket")
        return False
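

# Hedged sketch: against stock AWS S3, the bucket's default encryption could
# be verified with GetBucketEncryption, as below. SeaweedFS does not implement
# that call (per the note above), so this probe is best-effort and treats any
# ClientError as "cannot verify".
def probe_bucket_encryption(scheme: str, endpoint: str) -> Optional[str]:
    """Return the bucket's default SSE algorithm if the endpoint reports one."""
    s3_client = boto3.client(
        's3',
        endpoint_url=f"{scheme}://{endpoint}",
        aws_access_key_id=S3_ACCESS_KEY,
        aws_secret_access_key=S3_SECRET_KEY,
        region_name='us-east-1',
    )
    try:
        config = s3_client.get_bucket_encryption(Bucket=BUCKET_NAME)
        rules = config["ServerSideEncryptionConfiguration"]["Rules"]
        return rules[0]["ApplyServerSideEncryptionByDefault"]["SSEAlgorithm"]
    except ClientError:
        return None  # endpoint does not support or report bucket encryption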


def test_write_read_with_sse(
    s3: pafs.S3FileSystem,
    test_name: str,
    num_rows: int,
) -> tuple[bool, str, int]:
    """Test writing and reading with SSE-S3 encryption."""
    try:
        table = create_sample_table(num_rows)
        # write_dataset treats this path as a base directory and writes
        # part files beneath it; read_table then reads it back as a dataset.
        filename = f"{BUCKET_NAME}/{TEST_DIR}/{test_name}/data.parquet"

        logging.info(f" Writing {num_rows:,} rows...")
        pads.write_dataset(
            table,
            filename,
            filesystem=s3,
            format="parquet",
        )

        logging.info(" Reading back...")
        table_read = pq.read_table(filename, filesystem=s3)

        if table_read.num_rows != num_rows:
            return False, f"Row count mismatch: {table_read.num_rows} != {num_rows}", 0

        return True, "Success", table_read.num_rows

    except Exception as e:
        error_msg = f"{type(e).__name__}: {e}"
        logging.error(f" ✗ Failed: {error_msg}")
        return False, error_msg, 0
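

# Hedged variant sketch: write one Parquet object directly instead of a
# dataset directory. Assumption: PyArrow's S3 output stream uploads large
# writes via multipart, so this exercises the same bucket-default SSE-S3
# code path against a single key. The helper name and path are illustrative.
def test_single_object_write(s3: pafs.S3FileSystem, num_rows: int) -> bool:
    """Write and read back a single Parquet object under the test prefix."""
    path = f"{BUCKET_NAME}/{TEST_DIR}/single-object/data.parquet"
    table = create_sample_table(num_rows)
    with s3.open_output_stream(path) as sink:
        pq.write_table(table, sink)
    return pq.read_table(path, filesystem=s3).num_rows == num_rows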


def main():
    """Run SSE-S3 compatibility tests."""
    print("=" * 80)
    print("SSE-S3 Compatibility Tests for PyArrow Native S3")
    print("Testing Multipart Upload Encryption")
    print("=" * 80 + "\n")

    print("Configuration:")
    print(f" S3 Endpoint: {S3_ENDPOINT_URL}")
    print(f" Bucket: {BUCKET_NAME}")
    print(f" Test Directory: {TEST_DIR}")
    print(f" PyArrow Version: {pa.__version__}")
    print()

    # Initialize
    s3, scheme, endpoint = init_s3_filesystem()
    if s3 is None:
        print("Cannot proceed without S3 connection")
        return 1

    # Check the bucket (SSE-S3 itself can't be verified via the API)
    if not ensure_bucket_exists(scheme, endpoint):
        print("\n⚠ WARNING: Could not verify or create the test bucket!")
        print("This test requires a reachable bucket with SSE-S3 enabled.")
        print("Please start SeaweedFS with: make start-seaweedfs-ci ENABLE_SSE_S3=true")
        return 1

    print()
    results = []

    # Test all sizes
    for size_name, num_rows in TEST_SIZES.items():
        print(f"\n{'='*80}")
        print(f"Testing {size_name} dataset ({num_rows:,} rows)")
        print(f"{'='*80}")

        success, message, rows_read = test_write_read_with_sse(
            s3, size_name, num_rows
        )
        results.append((size_name, num_rows, success, message, rows_read))

        if success:
            print(f" ✓ SUCCESS: Read {rows_read:,} rows")
        else:
            print(f" ✗ FAILED: {message}")

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)

    passed = sum(1 for _, _, success, _, _ in results if success)
    total = len(results)
    print(f"\nTotal: {passed}/{total} tests passed\n")

    print(f"{'Size':<15} {'Rows':>10} {'Status':<10} {'Message':<40}")
    print("-" * 80)
    for size_name, num_rows, success, message, rows_read in results:
        status = "✓ PASS" if success else "✗ FAIL"
        print(f"{size_name:<15} {num_rows:>10,} {status:<10} {message[:40]}")

    print("\n" + "=" * 80)
    if passed == total:
        print("✓ ALL TESTS PASSED WITH SSE-S3!")
        print("\nThis means:")
        print(" - SSE-S3 encryption is working correctly")
        print(" - PyArrow's native S3 filesystem is compatible")
        print(" - Multipart uploads are handled properly")
    else:
        print(f"✗ {total - passed} test(s) failed")
        print("\nPossible issues:")
        print(" - SSE-S3 multipart upload bug with empty IV")
        print(" - Encryption/decryption mismatch")
        print(" - File corruption during upload")

    print("=" * 80 + "\n")

    return 0 if passed == total else 1


if __name__ == "__main__":
    sys.exit(main())