From 0e45e5bb103d0d85d1e2bd5ac84c2831b29931fb Mon Sep 17 00:00:00 2001
From: chrislu
Date: Wed, 19 Nov 2025 12:48:09 -0800
Subject: [PATCH] add sse-s3 tests

---
 .github/workflows/s3-parquet-tests.yml       |  11 +
 test/s3/parquet/Makefile                     |  57 +++-
 test/s3/parquet/README.md                    |  14 +-
 test/s3/parquet/test_sse_s3_compatibility.py | 296 +++++++++++++++++++
 4 files changed, 375 insertions(+), 3 deletions(-)
 create mode 100755 test/s3/parquet/test_sse_s3_compatibility.py

diff --git a/.github/workflows/s3-parquet-tests.yml b/.github/workflows/s3-parquet-tests.yml
index a3cc702e5..7c90c984f 100644
--- a/.github/workflows/s3-parquet-tests.yml
+++ b/.github/workflows/s3-parquet-tests.yml
@@ -97,6 +97,17 @@ jobs:
           VOLUME_PORT: 8080
           MASTER_PORT: 9333
 
+      - name: Run SSE-S3 encryption compatibility tests
+        run: |
+          cd test/s3/parquet
+          make test-sse-s3-compat
+        env:
+          SEAWEEDFS_BINARY: weed
+          S3_PORT: 8333
+          FILER_PORT: 8888
+          VOLUME_PORT: 8080
+          MASTER_PORT: 9333
+
       - name: Upload test logs on failure
         if: failure()
         uses: actions/upload-artifact@v4
diff --git a/test/s3/parquet/Makefile b/test/s3/parquet/Makefile
index 079ca5b9c..16fab9a3d 100644
--- a/test/s3/parquet/Makefile
+++ b/test/s3/parquet/Makefile
@@ -13,6 +13,7 @@ SECRET_KEY ?= some_secret_key1
 VOLUME_MAX_SIZE_MB ?= 50
 VOLUME_MAX_COUNT ?= 100
 BUCKET_NAME ?= test-parquet-bucket
+ENABLE_SSE_S3 ?= false
 
 # Python configuration
 PYTHON ?= python3
@@ -29,7 +30,7 @@ GREEN := \033[0;32m
 YELLOW := \033[1;33m
 NC := \033[0m # No Color
 
-.PHONY: all build-weed check-binary check-python ci-test clean debug-logs debug-status help manual-start manual-stop setup-python start-seaweedfs start-seaweedfs-ci stop-seaweedfs stop-seaweedfs-safe test test-implicit-dir test-implicit-dir-with-server test-native-s3 test-native-s3-with-server test-quick test-with-server
+.PHONY: all build-weed check-binary check-python ci-test clean debug-logs debug-status help manual-start manual-stop setup-python start-seaweedfs start-seaweedfs-ci stop-seaweedfs stop-seaweedfs-safe test test-implicit-dir test-implicit-dir-with-server test-native-s3 test-native-s3-with-server test-native-s3-with-sse test-quick test-sse-s3-compat test-with-server
 
 all: test
 
@@ -50,6 +51,8 @@ help:
 	@echo "  test-implicit-dir-with-server - Test implicit directory fix with server management"
 	@echo "  test-native-s3            - Test PyArrow's native S3 filesystem (assumes server running)"
 	@echo "  test-native-s3-with-server - Test PyArrow's native S3 filesystem with server management"
+	@echo "  test-native-s3-with-sse   - Test PyArrow's native S3 with SSE-S3 encryption enabled"
+	@echo "  test-sse-s3-compat        - Comprehensive SSE-S3 compatibility test (multipart uploads)"
 	@echo "  setup-python              - Setup Python virtual environment and install dependencies"
 	@echo "  check-python              - Check if Python and required packages are available"
 	@echo "  start-seaweedfs           - Start SeaweedFS server for testing"
@@ -133,7 +136,13 @@ start-seaweedfs-ci: check-binary
 
 	# Start filer server with embedded S3
 	@echo "Starting filer server with embedded S3..."
-	@printf '{"identities":[{"name":"%s","credentials":[{"accessKey":"%s","secretKey":"%s"}],"actions":["Admin","Read","Write"]}]}' "$(ACCESS_KEY)" "$(ACCESS_KEY)" "$(SECRET_KEY)" > /tmp/seaweedfs-parquet-s3.json
+	@if [ "$(ENABLE_SSE_S3)" = "true" ]; then \
+		echo "  SSE-S3 encryption: ENABLED"; \
+		printf '{"identities":[{"name":"%s","credentials":[{"accessKey":"%s","secretKey":"%s"}],"actions":["Admin","Read","Write"]}],"buckets":[{"name":"%s","encryption":{"sseS3":{"enabled":true}}}]}' "$(ACCESS_KEY)" "$(ACCESS_KEY)" "$(SECRET_KEY)" "$(BUCKET_NAME)" > /tmp/seaweedfs-parquet-s3.json; \
+	else \
+		echo "  SSE-S3 encryption: DISABLED"; \
+		printf '{"identities":[{"name":"%s","credentials":[{"accessKey":"%s","secretKey":"%s"}],"actions":["Admin","Read","Write"]}]}' "$(ACCESS_KEY)" "$(ACCESS_KEY)" "$(SECRET_KEY)" > /tmp/seaweedfs-parquet-s3.json; \
+	fi
 	@AWS_ACCESS_KEY_ID=$(ACCESS_KEY) AWS_SECRET_ACCESS_KEY=$(SECRET_KEY) nohup $(SEAWEEDFS_BINARY) filer -port=$(FILER_PORT) -port.grpc=$$(( $(FILER_PORT) + 10000 )) -master=127.0.0.1:$(MASTER_PORT) -dataCenter=defaultDataCenter -ip=127.0.0.1 -s3 -s3.port=$(S3_PORT) -s3.config=/tmp/seaweedfs-parquet-s3.json > /tmp/seaweedfs-parquet-filer.log 2>&1 &
 	@sleep 5
 
@@ -394,6 +403,50 @@ test-native-s3-with-server: build-weed setup-python
 		exit 1; \
 	fi
 
+# Test PyArrow's native S3 filesystem with SSE-S3 encryption
+test-native-s3-with-sse: build-weed setup-python
+	@echo "🚀 Starting PyArrow native S3 filesystem tests with SSE-S3 encryption..."
+	@echo "Starting SeaweedFS cluster with SSE-S3 enabled..."
+	@if $(MAKE) start-seaweedfs-ci ENABLE_SSE_S3=true > weed-test-sse.log 2>&1; then \
+		echo "✅ SeaweedFS cluster started successfully with SSE-S3"; \
+		echo "Running PyArrow native S3 filesystem tests with SSE-S3..."; \
+		trap '$(MAKE) -C $(TEST_DIR) stop-seaweedfs-safe || true' EXIT; \
+		S3_ENDPOINT_URL=http://localhost:$(S3_PORT) \
+		S3_ACCESS_KEY=$(ACCESS_KEY) \
+		S3_SECRET_KEY=$(SECRET_KEY) \
+		BUCKET_NAME=$(BUCKET_NAME) \
+		$(VENV_DIR)/bin/$(PYTHON) test_pyarrow_native_s3.py || exit 1; \
+		echo "✅ All SSE-S3 tests completed successfully"; \
+		$(MAKE) -C $(TEST_DIR) stop-seaweedfs-safe || true; \
+	else \
+		echo "❌ Failed to start SeaweedFS cluster with SSE-S3"; \
+		echo "=== Server startup logs ==="; \
+		tail -100 weed-test-sse.log 2>/dev/null || echo "No startup log available"; \
+		exit 1; \
+	fi
+
+# Comprehensive SSE-S3 compatibility test
+test-sse-s3-compat: build-weed setup-python
+	@echo "🚀 Starting comprehensive SSE-S3 compatibility tests..."
+	@echo "Starting SeaweedFS cluster with SSE-S3 enabled..."
+	@if $(MAKE) start-seaweedfs-ci ENABLE_SSE_S3=true > weed-test-sse-compat.log 2>&1; then \
+		echo "✅ SeaweedFS cluster started successfully with SSE-S3"; \
+		echo "Running comprehensive SSE-S3 compatibility tests..."; \
+		trap '$(MAKE) -C $(TEST_DIR) stop-seaweedfs-safe || true' EXIT; \
+		S3_ENDPOINT_URL=http://localhost:$(S3_PORT) \
+		S3_ACCESS_KEY=$(ACCESS_KEY) \
+		S3_SECRET_KEY=$(SECRET_KEY) \
+		BUCKET_NAME=$(BUCKET_NAME) \
+		$(VENV_DIR)/bin/$(PYTHON) test_sse_s3_compatibility.py || exit 1; \
+		echo "✅ All SSE-S3 compatibility tests completed successfully"; \
+		$(MAKE) -C $(TEST_DIR) stop-seaweedfs-safe || true; \
+	else \
+		echo "❌ Failed to start SeaweedFS cluster with SSE-S3"; \
+		echo "=== Server startup logs ==="; \
+		tail -100 weed-test-sse-compat.log 2>/dev/null || echo "No startup log available"; \
+		exit 1; \
+	fi
+
 # CI/CD targets
 ci-test: test-with-server
 
diff --git a/test/s3/parquet/README.md b/test/s3/parquet/README.md
index 4e1c875f0..ed65e4cbb 100644
--- a/test/s3/parquet/README.md
+++ b/test/s3/parquet/README.md
@@ -44,6 +44,9 @@ make test-implicit-dir-with-server
 # Run PyArrow native S3 filesystem tests
 make test-native-s3-with-server
 
+# Run SSE-S3 encryption tests
+make test-sse-s3-compat
+
 # Clean up
 make clean
 ```
@@ -119,6 +122,12 @@ dataset = pads.dataset('bucket/dataset', filesystem=s3)  # ✅
   - Tests 3 read methods × 2 dataset sizes = 6 scenarios
   - All tests pass ✅
 
+- **`test_sse_s3_compatibility.py`** - SSE-S3 encryption compatibility tests
+  - Tests PyArrow native S3 with SSE-S3 server-side encryption
+  - Tests 5 different file sizes (10 to 500,000 rows)
+  - Verifies multipart upload encryption works correctly
+  - All tests pass ✅
+
 ### Implicit Directory Tests
 - **`test_implicit_directory_fix.py`** - Specific tests for the implicit directory fix
   - Tests HEAD request behavior
@@ -193,6 +202,7 @@ make test-quick                   # Run quick tests with small files only (assumes serve
 make test-implicit-dir-with-server  # Run implicit directory tests with server
 make test-native-s3               # Run PyArrow native S3 tests (assumes server is running)
 make test-native-s3-with-server   # Run PyArrow native S3 tests with server management
+make test-sse-s3-compat           # Run comprehensive SSE-S3 encryption compatibility tests
 
 # Server Management
 make start-seaweedfs-ci           # Start SeaweedFS in background (CI mode)
@@ -213,6 +223,7 @@ The tests are automatically run in GitHub Actions on every push/PR that affects
 - Python versions: 3.9, 3.11, 3.12
 - PyArrow integration tests (s3fs): 20 test combinations
 - PyArrow native S3 tests: 6 test scenarios ✅ **NEW**
+- SSE-S3 encryption tests: 5 file sizes ✅ **NEW**
 - Implicit directory fix tests: 6 test scenarios
 - Go unit tests: 17 test cases
 
@@ -221,7 +232,8 @@ The tests are automatically run in GitHub Actions on every push/PR that affects
 2. Run PyArrow Parquet integration tests (`make test-with-server`)
 3. Run implicit directory fix tests (`make test-implicit-dir-with-server`)
 4. Run PyArrow native S3 filesystem tests (`make test-native-s3-with-server`) ✅ **NEW**
-5. Run Go unit tests for implicit directory handling
+5. Run SSE-S3 encryption compatibility tests (`make test-sse-s3-compat`) ✅ **NEW**
+6. Run Go unit tests for implicit directory handling
 
 **Triggers**:
 - Push/PR to master (when `weed/s3api/**` or `weed/filer/**` changes)
diff --git a/test/s3/parquet/test_sse_s3_compatibility.py b/test/s3/parquet/test_sse_s3_compatibility.py
new file mode 100755
index 000000000..ccadaf44c
--- /dev/null
+++ b/test/s3/parquet/test_sse_s3_compatibility.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+"""
+Test script for SSE-S3 compatibility with PyArrow native S3 filesystem.
+
+This test specifically targets the SSE-S3 multipart upload bug where
+SeaweedFS panics with "bad IV length" when reading multipart uploads
+that were encrypted with bucket-default SSE-S3.
+
+Requirements:
+    - pyarrow>=10.0.0
+    - boto3>=1.28.0
+
+Environment Variables:
+    S3_ENDPOINT_URL: S3 endpoint (default: localhost:8333)
+    S3_ACCESS_KEY: S3 access key (default: some_access_key1)
+    S3_SECRET_KEY: S3 secret key (default: some_secret_key1)
+    BUCKET_NAME: S3 bucket name (default: test-parquet-bucket)
+
+Usage:
+    # Start SeaweedFS with SSE-S3 enabled
+    make start-seaweedfs-ci ENABLE_SSE_S3=true
+
+    # Run the test
+    python3 test_sse_s3_compatibility.py
+"""
+
+import os
+import secrets
+import sys
+import logging
+from datetime import datetime
+from typing import Optional
+
+import pyarrow as pa
+import pyarrow.dataset as pads
+import pyarrow.fs as pafs
+import pyarrow.parquet as pq
+
+try:
+    import boto3
+    from botocore.exceptions import ClientError
+    HAS_BOTO3 = True
+except ImportError:
+    HAS_BOTO3 = False
+    logging.error("boto3 is required for this test")
+    sys.exit(1)
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+# Configuration
+S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "localhost:8333")
+S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
+S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
+BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
+
+TEST_RUN_ID = secrets.token_hex(8)
+TEST_DIR = f"sse-s3-tests/{TEST_RUN_ID}"
+
+# Test sizes designed to trigger multipart uploads
+# PyArrow typically uses 5MB chunks, so these sizes should trigger multipart
+TEST_SIZES = {
+    "tiny": 10,              # Single part
+    "small": 1_000,          # Single part
+    "medium": 50_000,        # Single part (~1.5MB)
+    "large": 200_000,        # Multiple parts (~6MB)
+    "very_large": 500_000,   # Multiple parts (~15MB)
+}
+
+
+def create_sample_table(num_rows: int = 5) -> pa.Table:
+    """Create a sample PyArrow table for testing.
+
+    Args:
+        num_rows: Number of rows to generate.
+
+    Returns:
+        A table with ``id``, ``name``, ``value`` and ``flag`` columns.
+    """
+    return pa.table(
+        {
+            "id": pa.array(range(num_rows), type=pa.int64()),
+            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
+            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
+            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
+        }
+    )
+
+
+def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
+    """Initialize PyArrow's native S3 filesystem.
+
+    Returns:
+        ``(filesystem, scheme, endpoint)``; on failure the filesystem is
+        ``None`` and both strings are empty.
+    """
+    try:
+        logging.info("Initializing PyArrow S3FileSystem...")
+
+        # Determine scheme from endpoint
+        if S3_ENDPOINT_URL.startswith("http://"):
+            scheme = "http"
+            endpoint = S3_ENDPOINT_URL[7:]
+        elif S3_ENDPOINT_URL.startswith("https://"):
+            scheme = "https"
+            endpoint = S3_ENDPOINT_URL[8:]
+        else:
+            scheme = "http"
+            endpoint = S3_ENDPOINT_URL
+
+        s3 = pafs.S3FileSystem(
+            access_key=S3_ACCESS_KEY,
+            secret_key=S3_SECRET_KEY,
+            endpoint_override=endpoint,
+            scheme=scheme,
+            allow_bucket_creation=True,
+            allow_bucket_deletion=True,
+        )
+
+        logging.info("✓ PyArrow S3FileSystem initialized\n")
+        return s3, scheme, endpoint
+    except Exception:
+        logging.exception("✗ Failed to initialize PyArrow S3FileSystem")
+        return None, "", ""
+
+
+def ensure_bucket_exists(scheme: str, endpoint: str) -> bool:
+    """Ensure the test bucket exists using boto3.
+
+    Args:
+        scheme: ``http`` or ``https``.
+        endpoint: Host (and port) of the S3 endpoint, without the scheme.
+
+    Returns:
+        ``True`` if the bucket exists or was created, ``False`` on any error.
+    """
+    try:
+        endpoint_url = f"{scheme}://{endpoint}"
+        s3_client = boto3.client(
+            's3',
+            endpoint_url=endpoint_url,
+            aws_access_key_id=S3_ACCESS_KEY,
+            aws_secret_access_key=S3_SECRET_KEY,
+            region_name='us-east-1',
+        )
+
+        try:
+            s3_client.head_bucket(Bucket=BUCKET_NAME)
+            logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
+        except ClientError as e:
+            # Only a missing bucket is recoverable; auth or server errors
+            # must fail the check instead of being reported as success.
+            if e.response['Error']['Code'] not in ('404', 'NoSuchBucket'):
+                raise
+            logging.info(f"Creating bucket: {BUCKET_NAME}")
+            s3_client.create_bucket(Bucket=BUCKET_NAME)
+            logging.info(f"✓ Bucket created: {BUCKET_NAME}")
+
+        # Note: SeaweedFS doesn't support GetBucketEncryption API
+        # so we can't verify if SSE-S3 is enabled via API
+        # We assume it's configured correctly in the s3.json config file
+        logging.info("✓ Assuming SSE-S3 is configured in s3.json")
+        return True
+
+    except Exception as e:
+        logging.exception(f"✗ Failed to check bucket: {e}")
+        return False
+
+
+def test_write_read_with_sse(
+    s3: pafs.S3FileSystem,
+    test_name: str,
+    num_rows: int
+) -> tuple[bool, str, int]:
+    """Test writing and reading with SSE-S3 encryption.
+
+    Args:
+        s3: Filesystem used for both the write and the read.
+        test_name: Sub-directory name under the per-run test directory.
+        num_rows: Number of rows to write and expect back.
+
+    Returns:
+        ``(success, message, rows_read)``; ``rows_read`` is 0 on failure.
+    """
+    try:
+        table = create_sample_table(num_rows)
+        filename = f"{BUCKET_NAME}/{TEST_DIR}/{test_name}/data.parquet"
+
+        logging.info(f"  Writing {num_rows:,} rows...")
+        pads.write_dataset(
+            table,
+            filename,
+            filesystem=s3,
+            format="parquet",
+        )
+
+        logging.info("  Reading back...")
+        table_read = pq.read_table(filename, filesystem=s3)
+
+        if table_read.num_rows != num_rows:
+            return False, f"Row count mismatch: {table_read.num_rows} != {num_rows}", 0
+
+        return True, "Success", table_read.num_rows
+
+    except Exception as e:
+        error_msg = f"{type(e).__name__}: {str(e)}"
+        logging.error(f"  ✗ Failed: {error_msg}")
+        return False, error_msg, 0
+
+
+def main():
+    """Run SSE-S3 compatibility tests.
+
+    Returns:
+        Process exit code: 0 if every size passed, 1 otherwise.
+    """
+    print("=" * 80)
+    print("SSE-S3 Compatibility Tests for PyArrow Native S3")
+    print("Testing Multipart Upload Encryption")
+    print("=" * 80 + "\n")
+
+    print("Configuration:")
+    print(f"  S3 Endpoint: {S3_ENDPOINT_URL}")
+    print(f"  Bucket: {BUCKET_NAME}")
+    print(f"  Test Directory: {TEST_DIR}")
+    print(f"  PyArrow Version: {pa.__version__}")
+    print()
+
+    # Initialize
+    s3, scheme, endpoint = init_s3_filesystem()
+    if s3 is None:
+        print("Cannot proceed without S3 connection")
+        return 1
+
+    # Check bucket and SSE-S3
+    if not ensure_bucket_exists(scheme, endpoint):
+        print("\n⚠ WARNING: Could not verify or create the test bucket!")
+        print("This test requires SSE-S3 encryption to be enabled.")
+        print("Please start SeaweedFS with: make start-seaweedfs-ci ENABLE_SSE_S3=true")
+        return 1
+
+    print()
+    results = []
+
+    # Test all sizes
+    for size_name, num_rows in TEST_SIZES.items():
+        print(f"\n{'='*80}")
+        print(f"Testing {size_name} dataset ({num_rows:,} rows)")
+        print(f"{'='*80}")
+
+        success, message, rows_read = test_write_read_with_sse(
+            s3, size_name, num_rows
+        )
+        results.append((size_name, num_rows, success, message, rows_read))
+
+        if success:
+            print(f"  ✓ SUCCESS: Read {rows_read:,} rows")
+        else:
+            print(f"  ✗ FAILED: {message}")
+
+    # Summary
+    print("\n" + "=" * 80)
+    print("SUMMARY")
+    print("=" * 80)
+
+    passed = sum(1 for _, _, success, _, _ in results if success)
+    total = len(results)
+    print(f"\nTotal: {passed}/{total} tests passed\n")
+
+    print(f"{'Size':<15} {'Rows':>10} {'Status':<10} {'Message':<40}")
+    print("-" * 80)
+    for size_name, num_rows, success, message, rows_read in results:
+        status = "✓ PASS" if success else "✗ FAIL"
+        print(f"{size_name:<15} {num_rows:>10,} {status:<10} {message[:40]}")
+
+    print("\n" + "=" * 80)
+    if passed == total:
+        print("✓ ALL TESTS PASSED WITH SSE-S3!")
+        print("\nThis means:")
+        print("  - SSE-S3 encryption is working correctly")
+        print("  - PyArrow native S3 filesystem is compatible")
+        print("  - Multipart uploads are handled properly")
+    else:
+        print(f"✗ {total - passed} test(s) failed")
+        print("\nPossible issues:")
+        print("  - SSE-S3 multipart upload bug with empty IV")
+        print("  - Encryption/decryption mismatch")
+        print("  - File corruption during upload")
+
+    print("=" * 80 + "\n")
+
+    return 0 if passed == total else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
+