5 changed files with 589 additions and 2 deletions:
- .github/workflows/s3-parquet-tests.yml (+11)
- test/s3/parquet/Makefile (+36)
- test/s3/parquet/README.md (+75)
- test/s3/parquet/example_pyarrow_native.py (+142)
- test/s3/parquet/test_pyarrow_native_s3.py (+327)
test/s3/parquet/example_pyarrow_native.py
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
# /// script
# dependencies = [
#     "pyarrow>=22",
#     "boto3>=1.28.0",
# ]
# ///

"""
Simple example of using PyArrow's native S3 filesystem with SeaweedFS.

This is a minimal example demonstrating how to write and read Parquet files
using PyArrow's built-in S3FileSystem without any additional dependencies
like s3fs.

Usage:
    # Set environment variables
    export S3_ENDPOINT_URL=localhost:8333
    export S3_ACCESS_KEY=some_access_key1
    export S3_SECRET_KEY=some_secret_key1
    export BUCKET_NAME=test-parquet-bucket

    # Run the script
    python3 example_pyarrow_native.py

    # Or run with uv (if available)
    uv run example_pyarrow_native.py
"""

import os
import secrets

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq

# Configuration
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.getenv("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.getenv("S3_SECRET_KEY", "some_secret_key1")

# Determine scheme from endpoint
if S3_ENDPOINT_URL.startswith("http://"):
    scheme = "http"
    endpoint = S3_ENDPOINT_URL[7:]
elif S3_ENDPOINT_URL.startswith("https://"):
    scheme = "https"
    endpoint = S3_ENDPOINT_URL[8:]
else:
    scheme = "http"  # Default to http for localhost
    endpoint = S3_ENDPOINT_URL

print(f"Connecting to S3 endpoint: {scheme}://{endpoint}")

# Initialize PyArrow's NATIVE S3 filesystem
s3 = pafs.S3FileSystem(
    access_key=S3_ACCESS_KEY,
    secret_key=S3_SECRET_KEY,
    endpoint_override=endpoint,
    scheme=scheme,
    allow_bucket_creation=True,
    allow_bucket_deletion=True,
)
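# Note: constructing S3FileSystem does not by itself guarantee that the
# endpoint is reachable; connection problems usually surface on the first
# filesystem call. A cheap sanity check (assuming the bucket already exists)
# could be:
#     s3.get_file_info(pafs.FileSelector(BUCKET_NAME))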

print("✓ S3 filesystem initialized")


def create_sample_table(num_rows: int = 5) -> pa.Table:
    """Create a sample PyArrow table for testing."""
    return pa.table(
        {
            "id": pa.array(range(num_rows), type=pa.int64()),
            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
        }
    )


# Create bucket if needed (using boto3)
try:
    import boto3
    from botocore.exceptions import ClientError

    s3_client = boto3.client(
        's3',
        endpoint_url=f"{scheme}://{endpoint}",
        aws_access_key_id=S3_ACCESS_KEY,
        aws_secret_access_key=S3_SECRET_KEY,
        region_name='us-east-1',
    )

    try:
        s3_client.head_bucket(Bucket=BUCKET_NAME)
        print(f"✓ Bucket exists: {BUCKET_NAME}")
    except ClientError as e:
        if e.response['Error']['Code'] == '404':
            print(f"Creating bucket: {BUCKET_NAME}")
            s3_client.create_bucket(Bucket=BUCKET_NAME)
            print(f"✓ Bucket created: {BUCKET_NAME}")
        else:
            raise
except ImportError:
    print("Warning: boto3 not available, assuming bucket exists")

# Generate a unique filename
filename = f"{BUCKET_NAME}/dataset-{secrets.token_hex(8)}/test.parquet"

print(f"\nWriting Parquet dataset to: {filename}")

# Write dataset
table = create_sample_table(200_000)
pads.write_dataset(
    table,
    filename,
    filesystem=s3,
    format="parquet",
)
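# pads.write_dataset treats `filename` as a base directory: despite the
# ".parquet" suffix, the data lands in part files underneath it (by default
# named like "part-0.parquet"). The reads below work because pq.read_table,
# pq.ParquetDataset and pads.dataset all accept a directory path and discover
# the part files inside it.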

print(f"✓ Wrote {table.num_rows:,} rows")

# Read with pq.read_table
print("\nReading with pq.read_table...")
table_read = pq.read_table(filename, filesystem=s3)
print(f"✓ Read {table_read.num_rows:,} rows")

# Read with pq.ParquetDataset
print("\nReading with pq.ParquetDataset...")
dataset = pq.ParquetDataset(filename, filesystem=s3)
table_dataset = dataset.read()
print(f"✓ Read {table_dataset.num_rows:,} rows")

# Read with pads.dataset
print("\nReading with pads.dataset...")
dataset_pads = pads.dataset(filename, filesystem=s3)
table_pads = dataset_pads.to_table()
print(f"✓ Read {table_pads.num_rows:,} rows")

print("\n✅ All operations completed successfully!")
print(f"\nFile written to: {filename}")
print("You can verify the file using the SeaweedFS S3 API or weed shell")
test/s3/parquet/test_pyarrow_native_s3.py
@@ -0,0 +1,327 @@
#!/usr/bin/env python3
"""
Test script for PyArrow's NATIVE S3 filesystem with SeaweedFS.

This test uses PyArrow's built-in S3FileSystem (pyarrow.fs.S3FileSystem)
instead of s3fs, providing a pure PyArrow solution for reading and writing
Parquet files to S3-compatible storage.

Requirements:
    - pyarrow>=10.0.0

Environment Variables:
    S3_ENDPOINT_URL: S3 endpoint (default: localhost:8333)
    S3_ACCESS_KEY: S3 access key (default: some_access_key1)
    S3_SECRET_KEY: S3 secret key (default: some_secret_key1)
    BUCKET_NAME: S3 bucket name (default: test-parquet-bucket)
    TEST_QUICK: Run only small/quick tests (default: 0, set to 1 for quick mode)

Usage:
    # Run with default environment variables
    python3 test_pyarrow_native_s3.py

    # Run with custom environment variables
    S3_ENDPOINT_URL=localhost:8333 \
    S3_ACCESS_KEY=mykey \
    S3_SECRET_KEY=mysecret \
    BUCKET_NAME=mybucket \
    python3 test_pyarrow_native_s3.py
"""

import os
import secrets
import sys
import logging
from typing import Optional

import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.fs as pafs
import pyarrow.parquet as pq

try:
    import boto3
    from botocore.exceptions import ClientError
    HAS_BOTO3 = True
except ImportError:
    HAS_BOTO3 = False

logging.basicConfig(level=logging.INFO, format="%(message)s")

# Configuration from environment variables with defaults
S3_ENDPOINT_URL = os.environ.get("S3_ENDPOINT_URL", "localhost:8333")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
TEST_QUICK = os.getenv("TEST_QUICK", "0") == "1"

# Create randomized test directory
TEST_RUN_ID = secrets.token_hex(8)
TEST_DIR = f"parquet-native-tests/{TEST_RUN_ID}"

# Test file sizes
TEST_SIZES = {
    "small": 5,
    "large": 200_000,  # This will create multiple row groups
}
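# Whether 200,000 rows actually ends up as multiple row groups depends on the
# writer defaults of the installed PyArrow version. To make the multi-row-group
# case explicit rather than implicit, write_dataset accepts row-group limits,
# for example (illustrative values, not used by this test as written):
#     pads.write_dataset(table, path, filesystem=s3, format="parquet",
#                        min_rows_per_group=50_000, max_rows_per_group=50_000)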

# Filter to only small tests if quick mode is enabled
if TEST_QUICK:
    TEST_SIZES = {"small": TEST_SIZES["small"]}
    logging.info("Quick test mode enabled - running only small tests")


def create_sample_table(num_rows: int = 5) -> pa.Table:
    """Create a sample PyArrow table for testing."""
    return pa.table(
        {
            "id": pa.array(range(num_rows), type=pa.int64()),
            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
        }
    )


def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
    """Initialize PyArrow's native S3 filesystem.

    Returns:
        tuple: (S3FileSystem instance or None on failure, scheme, endpoint)
    """
    try:
        logging.info("Initializing PyArrow S3FileSystem...")
        logging.info(f"  Endpoint: {S3_ENDPOINT_URL}")
        logging.info(f"  Bucket: {BUCKET_NAME}")

        # Determine scheme from endpoint
        if S3_ENDPOINT_URL.startswith("http://"):
            scheme = "http"
            endpoint = S3_ENDPOINT_URL[7:]  # Remove http://
        elif S3_ENDPOINT_URL.startswith("https://"):
            scheme = "https"
            endpoint = S3_ENDPOINT_URL[8:]  # Remove https://
        else:
            # Default to http for localhost
            scheme = "http"
            endpoint = S3_ENDPOINT_URL

        # Enable bucket creation and deletion for testing
        s3 = pafs.S3FileSystem(
            access_key=S3_ACCESS_KEY,
            secret_key=S3_SECRET_KEY,
            endpoint_override=endpoint,
            scheme=scheme,
            allow_bucket_creation=True,
            allow_bucket_deletion=True,
        )

        logging.info("✓ PyArrow S3FileSystem initialized successfully\n")
        return s3, scheme, endpoint
    except Exception:
        logging.exception("✗ Failed to initialize PyArrow S3FileSystem")
        return None, "", ""


def ensure_bucket_exists_boto3(scheme: str, endpoint: str) -> bool:
    """Ensure the test bucket exists using boto3."""
    if not HAS_BOTO3:
        logging.error("boto3 is required for bucket creation")
        return False

    try:
        # Create boto3 client
        endpoint_url = f"{scheme}://{endpoint}"
        s3_client = boto3.client(
            's3',
            endpoint_url=endpoint_url,
            aws_access_key_id=S3_ACCESS_KEY,
            aws_secret_access_key=S3_SECRET_KEY,
            region_name='us-east-1',
        )

        # Check if bucket exists
        try:
            s3_client.head_bucket(Bucket=BUCKET_NAME)
            logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
            return True
        except ClientError as e:
            error_code = e.response['Error']['Code']
            if error_code == '404':
                # Bucket doesn't exist, create it
                logging.info(f"Creating bucket: {BUCKET_NAME}")
                s3_client.create_bucket(Bucket=BUCKET_NAME)
                logging.info(f"✓ Bucket created: {BUCKET_NAME}")
                return True
            else:
                raise
    except Exception as e:
        logging.exception(f"✗ Failed to create/check bucket: {e}")
        return False


def ensure_bucket_exists(s3: pafs.S3FileSystem) -> bool:
    """Ensure the test bucket exists using PyArrow's native S3FileSystem."""
    try:
        # Check if bucket exists by trying to list it
        try:
            file_info = s3.get_file_info(BUCKET_NAME)
            if file_info.type == pafs.FileType.Directory:
                logging.info(f"✓ Bucket exists: {BUCKET_NAME}")
                return True
        except Exception:
            pass

        # Try to create the bucket
        logging.info(f"Creating bucket: {BUCKET_NAME}")
        s3.create_dir(BUCKET_NAME)
        logging.info(f"✓ Bucket created: {BUCKET_NAME}")
        return True
    except Exception as e:
        logging.error(f"✗ Failed to create/check bucket with PyArrow: {e}")
        return False
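# Note: s3.create_dir(BUCKET_NAME) in ensure_bucket_exists() can only create a
# bucket because the filesystem was constructed with allow_bucket_creation=True;
# without that flag, a bucket-level create_dir call is expected to fail and the
# boto3 fallback would be needed instead.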


def test_write_and_read(s3: pafs.S3FileSystem, test_name: str, num_rows: int) -> tuple[bool, str]:
    """Test writing and reading a Parquet dataset using PyArrow's native S3 filesystem."""
    try:
        table = create_sample_table(num_rows)

        # Write using pads.write_dataset
        filename = f"{BUCKET_NAME}/{TEST_DIR}/{test_name}/data.parquet"
        logging.info(f"  Writing {num_rows:,} rows to {filename}...")

        pads.write_dataset(
            table,
            filename,
            filesystem=s3,
            format="parquet",
        )
        logging.info("  ✓ Write completed")

        # Test Method 1: Read with pq.read_table
        logging.info("  Reading with pq.read_table...")
        table_read = pq.read_table(filename, filesystem=s3)
        if table_read.num_rows != num_rows:
            return False, f"pq.read_table: Row count mismatch (expected {num_rows}, got {table_read.num_rows})"
        logging.info(f"  ✓ pq.read_table: {table_read.num_rows:,} rows")

        # Test Method 2: Read with pq.ParquetDataset
        logging.info("  Reading with pq.ParquetDataset...")
        dataset = pq.ParquetDataset(filename, filesystem=s3)
        table_dataset = dataset.read()
        if table_dataset.num_rows != num_rows:
            return False, f"pq.ParquetDataset: Row count mismatch (expected {num_rows}, got {table_dataset.num_rows})"
        logging.info(f"  ✓ pq.ParquetDataset: {table_dataset.num_rows:,} rows")

        # Test Method 3: Read with pads.dataset
        logging.info("  Reading with pads.dataset...")
        dataset_pads = pads.dataset(filename, filesystem=s3)
        table_pads = dataset_pads.to_table()
        if table_pads.num_rows != num_rows:
            return False, f"pads.dataset: Row count mismatch (expected {num_rows}, got {table_pads.num_rows})"
        logging.info(f"  ✓ pads.dataset: {table_pads.num_rows:,} rows")

        return True, "All read methods passed"

    except Exception as e:
        logging.exception(f"  ✗ Test failed: {e}")
        return False, f"{type(e).__name__}: {str(e)}"


def cleanup_test_files(s3: pafs.S3FileSystem) -> None:
    """Clean up test files from S3."""
    try:
        test_path = f"{BUCKET_NAME}/{TEST_DIR}"
        logging.info(f"Cleaning up test directory: {test_path}")

        # Delete all files in the test directory
        file_info = s3.get_file_info(pafs.FileSelector(test_path, recursive=True))
        for info in file_info:
            if info.type == pafs.FileType.File:
                s3.delete_file(info.path)

        logging.info("✓ Test directory cleaned up")
    except Exception as e:
        logging.warning(f"Failed to cleanup test directory: {e}")
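# The loop above deletes file objects only, so directory placeholders created
# during the write may be left behind. A recursive alternative would be
# PyArrow's own helper, e.g.:
#     s3.delete_dir_contents(f"{BUCKET_NAME}/{TEST_DIR}")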


def main():
    """Run all tests with PyArrow's native S3 filesystem."""
    print("=" * 80)
    print("PyArrow Native S3 Filesystem Tests for SeaweedFS")
    print("Testing Parquet Files with Multiple Row Groups")
    if TEST_QUICK:
        print("*** QUICK TEST MODE - Small files only ***")
    print("=" * 80 + "\n")

    print("Configuration:")
    print(f"  S3 Endpoint: {S3_ENDPOINT_URL}")
    print(f"  Access Key: {S3_ACCESS_KEY}")
    print(f"  Bucket: {BUCKET_NAME}")
    print(f"  Test Directory: {TEST_DIR}")
    print(f"  Quick Mode: {'Yes (small files only)' if TEST_QUICK else 'No (all file sizes)'}")
    print(f"  PyArrow Version: {pa.__version__}")
    print()

    # Initialize S3 filesystem
    s3, scheme, endpoint = init_s3_filesystem()
    if s3 is None:
        print("Cannot proceed without S3 connection")
        return 1

    # Ensure bucket exists - try PyArrow first, fall back to boto3
    bucket_created = ensure_bucket_exists(s3)
    if not bucket_created:
        logging.info("Trying to create bucket with boto3...")
        bucket_created = ensure_bucket_exists_boto3(scheme, endpoint)

    if not bucket_created:
        print("Cannot proceed without bucket")
        return 1

    results = []

    # Test all file sizes
    for size_name, num_rows in TEST_SIZES.items():
        print(f"\n{'=' * 80}")
        print(f"Testing with {size_name} files ({num_rows:,} rows)")
        print(f"{'=' * 80}\n")

        test_name = f"{size_name}_test"
        success, message = test_write_and_read(s3, test_name, num_rows)
        results.append((test_name, success, message))

        status = "✓ PASS" if success else "✗ FAIL"
        print(f"\n{status}: {message}\n")

    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    passed = sum(1 for _, success, _ in results if success)
    total = len(results)
    print(f"\nTotal: {passed}/{total} passed\n")

    for test_name, success, message in results:
        status = "✓" if success else "✗"
        print(f"  {status} {test_name}: {message}")

    print("\n" + "=" * 80)
    if passed == total:
        print("✓ ALL TESTS PASSED!")
    else:
        print(f"✗ {total - passed} test(s) failed")

    print("=" * 80 + "\n")

    # Cleanup
    cleanup_test_files(s3)

    return 0 if passed == total else 1


if __name__ == "__main__":
    sys.exit(main())