From 1c650c646617c1da52082b8a3ea7ea823434fbfb Mon Sep 17 00:00:00 2001 From: chrislu Date: Wed, 19 Nov 2025 13:24:11 -0800 Subject: [PATCH] refactoring --- test/s3/parquet/example_pyarrow_native.py | 14 +------ test/s3/parquet/parquet_test_utils.py | 41 ++++++++++++++++++++ test/s3/parquet/test_pyarrow_native_s3.py | 23 +++++------ test/s3/parquet/test_sse_s3_compatibility.py | 14 +------ 4 files changed, 55 insertions(+), 37 deletions(-) create mode 100644 test/s3/parquet/parquet_test_utils.py diff --git a/test/s3/parquet/example_pyarrow_native.py b/test/s3/parquet/example_pyarrow_native.py index 73fda4a6a..785ce0b45 100755 --- a/test/s3/parquet/example_pyarrow_native.py +++ b/test/s3/parquet/example_pyarrow_native.py @@ -35,6 +35,8 @@ import pyarrow.dataset as pads import pyarrow.fs as pafs import pyarrow.parquet as pq +from parquet_test_utils import create_sample_table + # Configuration BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket") S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "localhost:8333") @@ -67,18 +69,6 @@ s3 = pafs.S3FileSystem( print("✓ Connected to S3 endpoint") -def create_sample_table(num_rows: int = 5) -> pa.Table: - """Create a sample PyArrow table for testing.""" - return pa.table( - { - "id": pa.array(range(num_rows), type=pa.int64()), - "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()), - "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()), - "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()), - } - ) - - # Create bucket if needed (using boto3) try: import boto3 diff --git a/test/s3/parquet/parquet_test_utils.py b/test/s3/parquet/parquet_test_utils.py new file mode 100644 index 000000000..d7e4c43db --- /dev/null +++ b/test/s3/parquet/parquet_test_utils.py @@ -0,0 +1,41 @@ +""" +Shared utility functions for PyArrow Parquet tests. + +This module provides common test utilities used across multiple test scripts +to avoid code duplication and ensure consistency. +""" + +import pyarrow as pa + + +def create_sample_table(num_rows: int = 5) -> pa.Table: + """Create a sample PyArrow table for testing. + + Args: + num_rows: Number of rows to generate (default: 5) + + Returns: + PyArrow Table with test data containing: + - id: int64 sequential IDs (0 to num_rows-1) + - name: string user names (user_0, user_1, ...) + - value: float64 values (id * 1.5) + - flag: bool alternating True/False based on even/odd id + + Example: + >>> table = create_sample_table(3) + >>> print(table) + pyarrow.Table + id: int64 + name: string + value: double + flag: bool + """ + return pa.table( + { + "id": pa.array(range(num_rows), type=pa.int64()), + "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()), + "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()), + "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()), + } + ) + diff --git a/test/s3/parquet/test_pyarrow_native_s3.py b/test/s3/parquet/test_pyarrow_native_s3.py index ef3ea2d2e..6b5317c45 100755 --- a/test/s3/parquet/test_pyarrow_native_s3.py +++ b/test/s3/parquet/test_pyarrow_native_s3.py @@ -46,6 +46,8 @@ try: except ImportError: HAS_BOTO3 = False +from parquet_test_utils import create_sample_table + logging.basicConfig(level=logging.INFO, format="%(message)s") # Configuration from environment variables with defaults @@ -71,18 +73,6 @@ if TEST_QUICK: logging.info("Quick test mode enabled - running only small tests") -def create_sample_table(num_rows: int = 5) -> pa.Table: - """Create a sample PyArrow table for testing.""" - return pa.table( - { - "id": pa.array(range(num_rows), type=pa.int64()), - "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()), - "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()), - "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()), - } - ) - - def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]: """Initialize PyArrow's native S3 filesystem. @@ -271,7 +261,14 @@ def test_write_and_read(s3: pafs.S3FileSystem, test_name: str, num_rows: int) -> def cleanup_test_files(s3: pafs.S3FileSystem) -> None: - """Clean up test files from S3.""" + """Clean up test files from S3. + + Note: We cannot use s3.delete_dir() directly because SeaweedFS uses implicit + directories (path prefixes without physical directory objects). PyArrow's + delete_dir() attempts to delete the directory marker itself, which fails with + "INTERNAL_FAILURE" on SeaweedFS. Instead, we list and delete files individually, + letting implicit directories disappear automatically. + """ try: test_path = f"{BUCKET_NAME}/{TEST_DIR}" logging.info(f"Cleaning up test directory: {test_path}") diff --git a/test/s3/parquet/test_sse_s3_compatibility.py b/test/s3/parquet/test_sse_s3_compatibility.py index 80a95bd6c..534a6f814 100755 --- a/test/s3/parquet/test_sse_s3_compatibility.py +++ b/test/s3/parquet/test_sse_s3_compatibility.py @@ -44,6 +44,8 @@ except ImportError: logging.exception("boto3 is required for this test") sys.exit(1) +from parquet_test_utils import create_sample_table + logging.basicConfig(level=logging.INFO, format="%(message)s") # Configuration @@ -66,18 +68,6 @@ TEST_SIZES = { } -def create_sample_table(num_rows: int = 5) -> pa.Table: - """Create a sample PyArrow table for testing.""" - return pa.table( - { - "id": pa.array(range(num_rows), type=pa.int64()), - "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()), - "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()), - "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()), - } - ) - - def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]: """Initialize PyArrow's native S3 filesystem.""" try: