5 changed files with 589 additions and 2 deletions:
- .github/workflows/s3-parquet-tests.yml (+11)
- test/s3/parquet/Makefile (+36)
- test/s3/parquet/README.md (+75)
- test/s3/parquet/example_pyarrow_native.py (+142)
- test/s3/parquet/test_pyarrow_native_s3.py (+327)
test/s3/parquet/example_pyarrow_native.py
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
# /// script
# dependencies = [
#     "pyarrow>=22",
#     "boto3>=1.28.0",
# ]
# ///

"""
Simple example of using PyArrow's native S3 filesystem with SeaweedFS.

This is a minimal example demonstrating how to write and read Parquet files
using PyArrow's built-in S3FileSystem without any additional dependencies
like s3fs.

Usage:
    # Set environment variables
    export S3_ENDPOINT_URL=localhost:8333
    export S3_ACCESS_KEY=some_access_key1
    export S3_SECRET_KEY=some_secret_key1
    export BUCKET_NAME=test-parquet-bucket

    # Run the script
    python3 example_pyarrow_native.py

    # Or run with uv (if available)
    uv run example_pyarrow_native.py
"""

import os
import secrets

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq

# Configuration
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.getenv("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.getenv("S3_SECRET_KEY", "some_secret_key1")

# Determine scheme from endpoint
if S3_ENDPOINT_URL.startswith("http://"):
    scheme = "http"
    endpoint = S3_ENDPOINT_URL[7:]
elif S3_ENDPOINT_URL.startswith("https://"):
    scheme = "https"
    endpoint = S3_ENDPOINT_URL[8:]
else:
    scheme = "http"  # Default to http for localhost
    endpoint = S3_ENDPOINT_URL

print(f"Connecting to S3 endpoint: {scheme}://{endpoint}")

# Initialize PyArrow's NATIVE S3 filesystem
s3 = pafs.S3FileSystem(
    access_key=S3_ACCESS_KEY,
    secret_key=S3_SECRET_KEY,
    endpoint_override=endpoint,
    scheme=scheme,
    allow_bucket_creation=True,
    allow_bucket_deletion=True,
)
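# Note: constructing S3FileSystem does not by itself guarantee that the
# endpoint is reachable; connection problems usually surface on the first
# filesystem call. A cheap sanity check (assuming the bucket already exists)
# could be:
#     s3.get_file_info(pafs.FileSelector(BUCKET_NAME))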

print("✓ S3 filesystem initialized")


def create_sample_table(num_rows: int = 5) -> pa.Table:
    """Create a sample PyArrow table for testing."""
    return pa.table(
        {
            "id": pa.array(range(num_rows), type=pa.int64()),
            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
        }
    )


# Create bucket if needed (using boto3)
try:
    import boto3
    from botocore.exceptions import ClientError

    s3_client = boto3.client(
        's3',
        endpoint_url=f"{scheme}://{endpoint}",
        aws_access_key_id=S3_ACCESS_KEY,
        aws_secret_access_key=S3_SECRET_KEY,
        region_name='us-east-1',
    )

    try:
        s3_client.head_bucket(Bucket=BUCKET_NAME)
        print(f"✓ Bucket exists: {BUCKET_NAME}")
    except ClientError as e:
        if e.response['Error']['Code'] == '404':
            print(f"Creating bucket: {BUCKET_NAME}")
            s3_client.create_bucket(Bucket=BUCKET_NAME)
            print(f"✓ Bucket created: {BUCKET_NAME}")
        else:
            raise
except ImportError:
    print("Warning: boto3 not available, assuming bucket exists")

# Generate a unique filename
filename = f"{BUCKET_NAME}/dataset-{secrets.token_hex(8)}/test.parquet"

print(f"\nWriting Parquet dataset to: {filename}")

# Write dataset
table = create_sample_table(200_000)
pads.write_dataset(
    table,
    filename,
    filesystem=s3,
    format="parquet",
)
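# pads.write_dataset treats `filename` as a base directory: despite the
# ".parquet" suffix, the data lands in part files underneath it (by default
# named like "part-0.parquet"). The reads below work because pq.read_table,
# pq.ParquetDataset and pads.dataset all accept a directory path and discover
# the part files inside it.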

print(f"✓ Wrote {table.num_rows:,} rows")

# Read with pq.read_table
print("\nReading with pq.read_table...")
table_read = pq.read_table(filename, filesystem=s3)
print(f"✓ Read {table_read.num_rows:,} rows")

# Read with pq.ParquetDataset
print("\nReading with pq.ParquetDataset...")
dataset = pq.ParquetDataset(filename, filesystem=s3)
table_dataset = dataset.read()
print(f"✓ Read {table_dataset.num_rows:,} rows")

# Read with pads.dataset
print("\nReading with pads.dataset...")
dataset_pads = pads.dataset(filename, filesystem=s3)
table_pads = dataset_pads.to_table()
print(f"✓ Read {table_pads.num_rows:,} rows")

print("\n✅ All operations completed successfully!")
print(f"\nFile written to: {filename}")
print("You can verify the file using the SeaweedFS S3 API or weed shell")
test/s3/parquet/test_pyarrow_native_s3.py
@@ -0,0 +1,327 @@
#!/usr/bin/env python3
"""
Test script for PyArrow's NATIVE S3 filesystem with SeaweedFS.

This test uses PyArrow's built-in S3FileSystem (pyarrow.fs.S3FileSystem)
instead of s3fs, providing a pure PyArrow solution for reading and writing
Parquet files to S3-compatible storage.

Requirements:
    - pyarrow>=10.0.0

Environment Variables:
    S3_ENDPOINT_URL: S3 endpoint (default: localhost:8333)
    S3_ACCESS_KEY: S3 access key (default: some_access_key1)
    S3_SECRET_KEY: S3 secret key (default: some_secret_key1)
    BUCKET_NAME: S3 bucket name (default: test-parquet-bucket)
    TEST_QUICK: Run only small/quick tests (default: 0, set to 1 for quick mode)

Usage:
    # Run with default environment variables
    python3 test_pyarrow_native_s3.py

    # Run with custom environment variables
    S3_ENDPOINT_URL=localhost:8333 \
    S3_ACCESS_KEY=mykey \
    S3_SECRET_KEY=mysecret \
    BUCKET_NAME=mybucket \
    python3 test_pyarrow_native_s3.py
"""

import os
import secrets
import sys
import logging
from typing import Optional

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq

try:
    import boto3
    from botocore.exceptions import ClientError
    HAS_BOTO3 = True
except ImportError:
    HAS_BOTO3 = False

logging.basicConfig(level=logging.INFO, format="%(message)s")

# Configuration from environment variables with defaults
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
TEST_QUICK = os.getenv("TEST_QUICK", "0") == "1"

# Create randomized test directory
TEST_RUN_ID = secrets.token_hex(8)
TEST_DIR = f"parquet-native-tests/{TEST_RUN_ID}"

# Test file sizes
TEST_SIZES = {
    "small": 5,
    "large": 200_000,  # This will create multiple row groups
}
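# Whether 200,000 rows actually ends up as multiple row groups depends on the
# writer defaults of the installed PyArrow version. To make the multi-row-group
# case explicit rather than implicit, write_dataset accepts row-group limits,
# for example (illustrative values, not used by this test as written):
#     pads.write_dataset(table, path, filesystem=s3, format="parquet",
#                        min_rows_per_group=50_000, max_rows_per_group=50_000)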

# Filter to only small tests if quick mode is enabled
if TEST_QUICK:
    TEST_SIZES = {"small": TEST_SIZES["small"]}
    logging.info("Quick test mode enabled - running only small tests")


def create_sample_table(num_rows: int = 5) -> pa.Table:
    """Create a sample PyArrow table for testing."""
    return pa.table(
        {
            "id": pa.array(range(num_rows), type=pa.int64()),
            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
        }
    )


def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
    """Initialize PyArrow's native S3 filesystem.

    Returns:
        tuple: (S3FileSystem instance or None on failure, scheme, endpoint)
    """
    try:
        logging.info("Initializing PyArrow S3FileSystem...")
        logging.info(f"  Endpoint: {S3_ENDPOINT_URL}")
        logging.info(f"  Bucket: {BUCKET_NAME}")

        # Determine scheme from endpoint
        if S3_ENDPOINT_URL.startswith("http://"):
            scheme = "http"
            endpoint = S3_ENDPOINT_URL[7:]  # Remove http://
        elif S3_ENDPOINT_URL.startswith("https://"):
            scheme = "https"
            endpoint = S3_ENDPOINT_URL[8:]  # Remove https://
        else:
            # Default to http for localhost
            scheme = "http"
            endpoint = S3_ENDPOINT_URL

        # Enable bucket creation and deletion for testing
        s3 = pafs.S3FileSystem(
            access_key=S3_ACCESS_KEY,
            secret_key=S3_SECRET_KEY,
            endpoint_override=endpoint,
            scheme=scheme,
            allow_bucket_creation=True,
            allow_bucket_deletion=True,
        )

        logging.info("✓ PyArrow S3FileSystem initialized successfully\n")
        return s3, scheme, endpoint
    except Exception:
        logging.exception("✗ Failed to initialize PyArrow S3FileSystem")
        return None, "", ""


def ensure_bucket_exists_boto3(scheme: str, endpoint: str) -> bool:
    """Ensure the test bucket exists using boto3."""
    if not HAS_BOTO3:
        logging.error("boto3 is required for bucket creation")
        return False

    try:
        # Create boto3 client
        endpoint_url = f"{scheme}://{endpoint}"
        s3_client = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=S3_ACCESS_KEY,
            aws_secret_access_key=S3_SECRET_KEY,
            region_name='us-east-1',
        )

        # Check if bucket exists
        try:
            s3_client.head_bucket(Bucket=BUCKET_NAME)
            logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
            return True
        except ClientError as e:
            error_code = e.response['Error']['Code']
            if error_code == '404':
                # Bucket doesn't exist, create it
                logging.info(f"Creating bucket: {BUCKET_NAME}")
                s3_client.create_bucket(Bucket=BUCKET_NAME)
                logging.info(f"✓ Bucket created: {BUCKET_NAME}")
                return True
            else:
                raise
    except Exception as e:
        logging.exception(f"✗ Failed to create/check bucket: {e}")
        return False


def ensure_bucket_exists(s3: pafs.S3FileSystem) -> bool:
    """Ensure the test bucket exists using PyArrow's native S3FileSystem."""
    try:
        # Check if bucket exists by trying to list it
        try:
            file_info = s3.get_file_info(BUCKET_NAME)
            if file_info.type == pafs.FileType.Directory:
                logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
                return True
        except Exception:
            pass

        # Try to create the bucket
        logging.info(f"Creating bucket: {BUCKET_NAME}")
        s3.create_dir(BUCKET_NAME)
        logging.info(f"✓ Bucket created: {BUCKET_NAME}")
        return True
    except Exception as e:
        logging.error(f"✗ Failed to create/check bucket with PyArrow: {e}")
        return False
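# Note: s3.create_dir(BUCKET_NAME) in ensure_bucket_exists() can only create a
# bucket because the filesystem was constructed with allow_bucket_creation=True;
# without that flag, a bucket-level create_dir call is expected to fail and the
# boto3 fallback would be needed instead.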


def test_write_and_read(s3: pafs.S3FileSystem, test_name: str, num_rows: int) -> tuple[bool, str]:
    """Test writing and reading a Parquet dataset using PyArrow's native S3 filesystem."""
    try:
        table = create_sample_table(num_rows)

        # Write using pads.write_dataset
        filename = f"{BUCKET_NAME}/{TEST_DIR}/{test_name}/data.parquet"
        logging.info(f"  Writing {num_rows:,} rows to {filename}...")

        pads.write_dataset(
            table,
            filename,
            filesystem=s3,
            format="parquet",
        )
        logging.info("  ✓ Write completed")

        # Test Method 1: Read with pq.read_table
        logging.info("  Reading with pq.read_table...")
        table_read = pq.read_table(filename, filesystem=s3)
        if table_read.num_rows != num_rows:
            return False, f"pq.read_table: Row count mismatch (expected {num_rows}, got {table_read.num_rows})"
        logging.info(f"  ✓ pq.read_table: {table_read.num_rows:,} rows")

        # Test Method 2: Read with pq.ParquetDataset
        logging.info("  Reading with pq.ParquetDataset...")
        dataset = pq.ParquetDataset(filename, filesystem=s3)
        table_dataset = dataset.read()
        if table_dataset.num_rows != num_rows:
            return False, f"pq.ParquetDataset: Row count mismatch (expected {num_rows}, got {table_dataset.num_rows})"
        logging.info(f"  ✓ pq.ParquetDataset: {table_dataset.num_rows:,} rows")

        # Test Method 3: Read with pads.dataset
        logging.info("  Reading with pads.dataset...")
        dataset_pads = pads.dataset(filename, filesystem=s3)
        table_pads = dataset_pads.to_table()
        if table_pads.num_rows != num_rows:
            return False, f"pads.dataset: Row count mismatch (expected {num_rows}, got {table_pads.num_rows})"
        logging.info(f"  ✓ pads.dataset: {table_pads.num_rows:,} rows")

        return True, "All read methods passed"

    except Exception as e:
        logging.exception(f"  ✗ Test failed: {e}")
        return False, f"{type(e).__name__}: {str(e)}"


def cleanup_test_files(s3: pafs.S3FileSystem) -> None:
    """Clean up test files from S3."""
    try:
        test_path = f"{BUCKET_NAME}/{TEST_DIR}"
        logging.info(f"Cleaning up test directory: {test_path}")

        # Delete all files in the test directory
        file_info = s3.get_file_info(pafs.FileSelector(test_path, recursive=True))
        for info in file_info:
            if info.type == pafs.FileType.File:
                s3.delete_file(info.path)

        logging.info("✓ Test directory cleaned up")
    except Exception as e:
        logging.warning(f"Failed to cleanup test directory: {e}")
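# The loop above deletes file objects only, so directory placeholders created
# during the write may be left behind. A recursive alternative would be
# PyArrow's own helper, e.g.:
#     s3.delete_dir_contents(f"{BUCKET_NAME}/{TEST_DIR}")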


def main():
    """Run all tests with PyArrow's native S3 filesystem."""
    print("=" * 80)
    print("PyArrow Native S3 Filesystem Tests for SeaweedFS")
    print("Testing Parquet Files with Multiple Row Groups")
    if TEST_QUICK:
        print("*** QUICK TEST MODE - Small files only ***")
    print("=" * 80 + "\n")

    print("Configuration:")
    print(f"  S3 Endpoint: {S3_ENDPOINT_URL}")
    print(f"  Access Key: {S3_ACCESS_KEY}")
    print(f"  Bucket: {BUCKET_NAME}")
    print(f"  Test Directory: {TEST_DIR}")
    print(f"  Quick Mode: {'Yes (small files only)' if TEST_QUICK else 'No (all file sizes)'}")
    print(f"  PyArrow Version: {pa.__version__}")
    print()

    # Initialize S3 filesystem
    s3, scheme, endpoint = init_s3_filesystem()
    if s3 is None:
        print("Cannot proceed without S3 connection")
        return 1

    # Ensure bucket exists - try PyArrow first, fall back to boto3
    bucket_created = ensure_bucket_exists(s3)
    if not bucket_created:
        logging.info("Trying to create bucket with boto3...")
        bucket_created = ensure_bucket_exists_boto3(scheme, endpoint)

    if not bucket_created:
        print("Cannot proceed without bucket")
        return 1

    results = []

    # Test all file sizes
    for size_name, num_rows in TEST_SIZES.items():
        print(f"\n{'=' * 80}")
        print(f"Testing with {size_name} files ({num_rows:,} rows)")
        print(f"{'=' * 80}\n")

        test_name = f"{size_name}_test"
        success, message = test_write_and_read(s3, test_name, num_rows)
        results.append((test_name, success, message))

        status = "✓ PASS" if success else "✗ FAIL"
        print(f"\n{status}: {message}\n")

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    passed = sum(1 for _, success, _ in results if success)
    total = len(results)
    print(f"\nTotal: {passed}/{total} passed\n")

    for test_name, success, message in results:
        status = "✓" if success else "✗"
        print(f"  {status} {test_name}: {message}")

    print("\n" + "=" * 80)
    if passed == total:
        print("✓ ALL TESTS PASSED!")
    else:
        print(f"✗ {total - passed} test(s) failed")

    print("=" * 80 + "\n")

    # Cleanup
    cleanup_test_files(s3)

    return 0 if passed == total else 1


if __name__ == "__main__":
    sys.exit(main())