From 1c650c646617c1da52082b8a3ea7ea823434fbfb Mon Sep 17 00:00:00 2001
From: chrislu <chris.lu@gmail.com>
Date: Wed, 19 Nov 2025 13:24:11 -0800
Subject: [PATCH] test/s3/parquet: share create_sample_table via parquet_test_utils

---
 test/s3/parquet/example_pyarrow_native.py    | 14 +------
 test/s3/parquet/parquet_test_utils.py        | 41 ++++++++++++++++++++
 test/s3/parquet/test_pyarrow_native_s3.py    | 23 +++++------
 test/s3/parquet/test_sse_s3_compatibility.py | 14 +------
 4 files changed, 55 insertions(+), 37 deletions(-)
 create mode 100644 test/s3/parquet/parquet_test_utils.py

diff --git a/test/s3/parquet/example_pyarrow_native.py b/test/s3/parquet/example_pyarrow_native.py
index 73fda4a6a..785ce0b45 100755
--- a/test/s3/parquet/example_pyarrow_native.py
+++ b/test/s3/parquet/example_pyarrow_native.py
@@ -35,6 +35,8 @@ import pyarrow.dataset as pads
 import pyarrow.fs as pafs
 import pyarrow.parquet as pq
 
+from parquet_test_utils import create_sample_table
+
 # Configuration
 BUCKET_NAME = os.getenv("BUCKET_NAME", "test-parquet-bucket")
 S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "localhost:8333")
@@ -67,18 +69,6 @@ s3 = pafs.S3FileSystem(
 print("✓ Connected to S3 endpoint")
 
 
-def create_sample_table(num_rows: int = 5) -> pa.Table:
-    """Create a sample PyArrow table for testing."""
-    return pa.table(
-        {
-            "id": pa.array(range(num_rows), type=pa.int64()),
-            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
-            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
-            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
-        }
-    )
-
-
 # Create bucket if needed (using boto3)
 try:
     import boto3
diff --git a/test/s3/parquet/parquet_test_utils.py b/test/s3/parquet/parquet_test_utils.py
new file mode 100644
index 000000000..d7e4c43db
--- /dev/null
+++ b/test/s3/parquet/parquet_test_utils.py
@@ -0,0 +1,41 @@
+"""
+Shared utility functions for PyArrow Parquet tests.
+
+This module provides common test utilities used across multiple test scripts
+to avoid code duplication and ensure consistency.
+"""
+
+import pyarrow as pa
+
+
+def create_sample_table(num_rows: int = 5) -> pa.Table:
+    """Create a sample PyArrow table for testing.
+    
+    Args:
+        num_rows: Number of rows to generate (default: 5)
+    
+    Returns:
+        PyArrow Table with test data containing:
+        - id: int64 sequential IDs (0 to num_rows-1)
+        - name: string user names (user_0, user_1, ...)
+        - value: float64 values (id * 1.5)
+        - flag: bool, True for even ids and False for odd ids
+    
+    Example:
+        >>> table = create_sample_table(3)
+        >>> table.num_rows
+        3
+        >>> table.column_names
+        ['id', 'name', 'value', 'flag']
+        >>> table.column("value").to_pylist()
+        [0.0, 1.5, 3.0]
+    """
+    return pa.table(
+        {
+            "id": pa.array(range(num_rows), type=pa.int64()),
+            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
+            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
+            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
+        }
+    )
+
diff --git a/test/s3/parquet/test_pyarrow_native_s3.py b/test/s3/parquet/test_pyarrow_native_s3.py
index ef3ea2d2e..6b5317c45 100755
--- a/test/s3/parquet/test_pyarrow_native_s3.py
+++ b/test/s3/parquet/test_pyarrow_native_s3.py
@@ -46,6 +46,8 @@ try:
 except ImportError:
     HAS_BOTO3 = False
 
+from parquet_test_utils import create_sample_table
+
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 
 # Configuration from environment variables with defaults
@@ -71,18 +73,6 @@ if TEST_QUICK:
     logging.info("Quick test mode enabled - running only small tests")
 
 
-def create_sample_table(num_rows: int = 5) -> pa.Table:
-    """Create a sample PyArrow table for testing."""
-    return pa.table(
-        {
-            "id": pa.array(range(num_rows), type=pa.int64()),
-            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
-            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
-            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
-        }
-    )
-
-
 def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
     """Initialize PyArrow's native S3 filesystem.
     
@@ -271,7 +261,14 @@ def test_write_and_read(s3: pafs.S3FileSystem, test_name: str, num_rows: int) ->
 
 
 def cleanup_test_files(s3: pafs.S3FileSystem) -> None:
-    """Clean up test files from S3."""
+    """Clean up test files from S3.
+    
+    Note: We cannot use s3.delete_dir() directly because SeaweedFS uses implicit
+    directories (path prefixes without physical directory objects). PyArrow's
+    delete_dir() attempts to delete the directory marker itself, which fails with
+    "INTERNAL_FAILURE" on SeaweedFS. Instead, we list and delete files individually,
+    letting implicit directories disappear automatically.
+    """
     try:
         test_path = f"{BUCKET_NAME}/{TEST_DIR}"
         logging.info(f"Cleaning up test directory: {test_path}")
diff --git a/test/s3/parquet/test_sse_s3_compatibility.py b/test/s3/parquet/test_sse_s3_compatibility.py
index 80a95bd6c..534a6f814 100755
--- a/test/s3/parquet/test_sse_s3_compatibility.py
+++ b/test/s3/parquet/test_sse_s3_compatibility.py
@@ -44,6 +44,8 @@ except ImportError:
     logging.exception("boto3 is required for this test")
     sys.exit(1)
 
+from parquet_test_utils import create_sample_table
+
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 
 # Configuration
@@ -66,18 +68,6 @@ TEST_SIZES = {
 }
 
 
-def create_sample_table(num_rows: int = 5) -> pa.Table:
-    """Create a sample PyArrow table for testing."""
-    return pa.table(
-        {
-            "id": pa.array(range(num_rows), type=pa.int64()),
-            "name": pa.array([f"user_{i}" for i in range(num_rows)], type=pa.string()),
-            "value": pa.array([float(i) * 1.5 for i in range(num_rows)], type=pa.float64()),
-            "flag": pa.array([i % 2 == 0 for i in range(num_rows)], type=pa.bool_()),
-        }
-    )
-
-
 def init_s3_filesystem() -> tuple[Optional[pafs.S3FileSystem], str, str]:
     """Initialize PyArrow's native S3 filesystem."""
     try: