seaweedfs/test/s3/parquet/test_implicit_directory_fix.py


								#!/usr/bin/env python3

								"""

								Test script to verify the implicit directory fix for s3fs compatibility.


								This test verifies that:

								1. Implicit directory markers (0-byte objects with children) return 404 on HEAD

								2. s3fs correctly identifies them as directories via LIST fallback

								3. PyArrow can read datasets created with write_dataset()


								The fix makes SeaweedFS behave like AWS S3 and improves s3fs compatibility.

								"""


								import io

								import logging

								import os

								import sys

								import traceback


								import pyarrow as pa

								import pyarrow.dataset as pads

								import pyarrow.parquet as pq

								import s3fs

								import boto3

								from botocore.exceptions import ClientError


								# Logging: timestamped, level-tagged messages at INFO and above.
								logging.basicConfig(
								    format='%(asctime)s - %(levelname)s - %(message)s',
								    level=logging.INFO,
								)
								logger = logging.getLogger(__name__)

								# Connection settings. Each value can be overridden through an environment
								# variable of the same name; the literals below are the fallbacks.
								S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "http://localhost:8333")
								S3_ACCESS_KEY = os.getenv("S3_ACCESS_KEY", "some_access_key1")
								S3_SECRET_KEY = os.getenv("S3_SECRET_KEY", "some_secret_key1")
								BUCKET_NAME = os.getenv("BUCKET_NAME", "test-implicit-dir")


								def create_sample_table(num_rows: int = 1000) -> pa.Table:
								    """Build a three-column PyArrow table with ``num_rows`` rows.

								    Columns:
								        id:    int64, the row index ``0 .. num_rows - 1``.
								        value: string, ``value_<index>``.
								        score: float64, ``index * 1.5``.
								    """
								    indices = range(num_rows)
								    id_column = pa.array(indices, type=pa.int64())
								    value_column = pa.array([f'value_{i}' for i in indices], type=pa.string())
								    score_column = pa.array([float(i) * 1.5 for i in indices], type=pa.float64())
								    return pa.table({
								        'id': id_column,
								        'value': value_column,
								        'score': score_column,
								    })


								def setup_s3():
								    """Create the S3 handles shared by all tests.

								    Returns:
								        A ``(fs, s3_client)`` tuple: an ``s3fs.S3FileSystem`` and a boto3
								        S3 client, both built from the module-level endpoint URL and
								        access/secret keys, with ``use_ssl=False``.
								    """
								    # High-level filesystem view (what PyArrow talks to).
								    endpoint_options = {'endpoint_url': S3_ENDPOINT_URL}
								    fs = s3fs.S3FileSystem(
								        key=S3_ACCESS_KEY,
								        secret=S3_SECRET_KEY,
								        client_kwargs=endpoint_options,
								        use_ssl=False,
								    )

								    # Low-level client for raw HEAD/PUT requests.
								    boto_options = dict(
								        endpoint_url=S3_ENDPOINT_URL,
								        aws_access_key_id=S3_ACCESS_KEY,
								        aws_secret_access_key=S3_SECRET_KEY,
								        use_ssl=False,
								    )
								    s3_client = boto3.client('s3', **boto_options)

								    return fs, s3_client


								def test_implicit_directory_head_behavior(fs, s3_client):
								    """Test that HEAD on implicit directory markers returns 404.

								    Writes a parquet dataset to ``<BUCKET_NAME>/test_implicit_dir`` through
								    ``fs``, logs the resulting listing, then issues a HEAD request for the
								    key ``test_implicit_dir`` (no trailing slash) through ``s3_client``.

								    Args:
								        fs: s3fs filesystem used to clean up, write and list the dataset.
								        s3_client: boto3 S3 client used for the raw HEAD request.

								    Returns:
								        True if the HEAD request raises a ``ClientError`` whose error code
								        is ``'404'``; False if HEAD succeeds or fails with another code.
								    """
								    logger.info("\n" + "="*80)
								    logger.info("TEST 1: Implicit Directory HEAD Behavior")
								    logger.info("="*80)

								    test_path = f"{BUCKET_NAME}/test_implicit_dir"

								    # Clean up any existing data. This is best-effort (the path usually
								    # just doesn't exist yet), but catch Exception rather than using a
								    # bare except so KeyboardInterrupt/SystemExit are not swallowed.
								    try:
								        fs.rm(test_path, recursive=True)
								    except Exception as e:
								        logger.debug(f"Cleanup of {test_path} skipped: {e}")

								    # Create a dataset using PyArrow (creates implicit directory)
								    logger.info(f"Creating dataset at: {test_path}")
								    table = create_sample_table(1000)
								    pads.write_dataset(table, test_path, filesystem=fs, format='parquet')

								    # List what was created
								    logger.info("\nFiles created:")
								    for entry in fs.ls(test_path, detail=True):
								        logger.info(f"  {entry['name']} - size: {entry['size']} bytes, type: {entry['type']}")

								    # Test HEAD request on the directory marker (without trailing slash)
								    logger.info(f"\nTesting HEAD on: {test_path}")
								    try:
								        response = s3_client.head_object(Bucket=BUCKET_NAME, Key='test_implicit_dir')
								        # Reaching this point means the server answered the HEAD
								        # successfully, which is the behavior the fix should prevent.
								        logger.info(f"  HEAD response: {response['ResponseMetadata']['HTTPStatusCode']}")
								        logger.info(f"  Content-Length: {response.get('ContentLength', 'N/A')}")
								        logger.info(f"  Content-Type: {response.get('ContentType', 'N/A')}")
								        logger.warning("  ⚠️  Expected 404, but got 200 - fix may not be working")
								        return False
								    except ClientError as e:
								        if e.response['Error']['Code'] == '404':
								            logger.info("  ✓ HEAD returned 404 (expected - implicit directory)")
								            return True
								        else:
								            logger.error(f"  ✗ Unexpected error: {e}")
								            return False


								def test_s3fs_directory_detection(fs):
								    """Check that ``fs.info()`` reports the dataset path as a directory.

								    Args:
								        fs: s3fs filesystem to query.

								    Returns:
								        True when the reported ``type`` is ``'directory'``; False for any
								        other type or if ``fs.info()`` raises.
								    """
								    separator = "="*80
								    logger.info("\n" + separator)
								    logger.info("TEST 2: s3fs Directory Detection")
								    logger.info(separator)

								    test_path = f"{BUCKET_NAME}/test_implicit_dir"

								    # Ask s3fs what it thinks the path is.
								    logger.info(f"\nTesting s3fs.info('{test_path}'):")
								    try:
								        entry = fs.info(test_path)
								        logger.info(f"  Type: {entry.get('type', 'N/A')}")
								        logger.info(f"  Size: {entry.get('size', 'N/A')}")

								        entry_type = entry.get('type')
								        if entry_type != 'directory':
								            logger.warning(f"  ⚠️  s3fs identified as: {entry_type}")
								            return False

								        logger.info("  ✓ s3fs correctly identified as directory")
								        return True
								    except Exception as err:
								        logger.error(f"  ✗ Error: {err}")
								        return False


								def test_s3fs_isdir(fs):
								    """Check that ``fs.isdir()`` is truthy for the dataset path.

								    Args:
								        fs: s3fs filesystem to query.

								    Returns:
								        True when ``fs.isdir()`` returns a truthy value; False when it
								        returns a falsy value or raises.
								    """
								    separator = "="*80
								    logger.info("\n" + separator)
								    logger.info("TEST 3: s3fs.isdir() Method")
								    logger.info(separator)

								    test_path = f"{BUCKET_NAME}/test_implicit_dir"

								    logger.info(f"\nTesting s3fs.isdir('{test_path}'):")
								    try:
								        detected = fs.isdir(test_path)
								        logger.info(f"  Result: {detected}")

								        if not detected:
								            logger.warning("  ⚠️  s3fs.isdir() returned False")
								            return False

								        logger.info("  ✓ s3fs.isdir() correctly returned True")
								        return True
								    except Exception as err:
								        logger.error(f"  ✗ Error: {err}")
								        return False


								def test_pyarrow_dataset_read(fs):
								    """Check that PyArrow can load the parquet dataset back through ``fs``.

								    Args:
								        fs: s3fs filesystem handed to ``pyarrow.dataset``.

								    Returns:
								        True when the dataset is opened and materialized as a table; False
								        (after printing the traceback) if any step raises.
								    """
								    separator = "="*80
								    logger.info("\n" + separator)
								    logger.info("TEST 4: PyArrow Dataset Read")
								    logger.info(separator)

								    test_path = f"{BUCKET_NAME}/test_implicit_dir"

								    logger.info(f"\nReading dataset from: {test_path}")
								    try:
								        loaded = pads.dataset(test_path, filesystem=fs, format='parquet').to_table()
								        logger.info(f"  ✓ Successfully read {len(loaded)} rows")
								        logger.info(f"  Columns: {loaded.column_names}")
								        return True
								    except Exception as err:
								        logger.error(f"  ✗ Failed to read dataset: {err}")
								        traceback.print_exc()
								        return False


								def test_explicit_directory_marker(fs, s3_client):
								    """Check that an explicit directory marker (trailing slash) answers HEAD.

								    Puts a zero-byte ``explicit_dir/`` object with content type
								    ``httpd/unix-directory`` and then issues a HEAD request for that key.

								    Args:
								        fs: unused by this test; kept for a uniform test signature.
								        s3_client: boto3 S3 client used for the PUT and HEAD requests.

								    Returns:
								        True when both the PUT and the HEAD succeed; False if the PUT
								        raises or the HEAD raises ``ClientError``.
								    """
								    separator = "="*80
								    logger.info("\n" + separator)
								    logger.info("TEST 5: Explicit Directory Marker (with trailing slash)")
								    logger.info(separator)

								    marker_key = 'explicit_dir/'

								    # Step 1: write the marker object.
								    logger.info(f"\nCreating explicit directory: {BUCKET_NAME}/{marker_key}")
								    put_arguments = {
								        'Bucket': BUCKET_NAME,
								        'Key': marker_key,
								        'Body': b'',
								        'ContentType': 'httpd/unix-directory',
								    }
								    try:
								        s3_client.put_object(**put_arguments)
								        logger.info("  ✓ Created explicit directory marker")
								    except Exception as err:
								        logger.error(f"  ✗ Failed to create: {err}")
								        return False

								    # Step 2: HEAD the marker, including its trailing slash.
								    logger.info(f"\nTesting HEAD on: {BUCKET_NAME}/{marker_key}")
								    try:
								        head = s3_client.head_object(Bucket=BUCKET_NAME, Key=marker_key)
								        logger.info("  ✓ HEAD returned 200 (expected for explicit directory)")
								        logger.info(f"  Content-Type: {head.get('ContentType', 'N/A')}")
								        return True
								    except ClientError as err:
								        logger.error(f"  ✗ HEAD failed: {err}")
								        return False


								def test_empty_file_not_directory(fs, s3_client):
								    """Check that a genuine zero-byte file is still reported as a file.

								    Puts an empty ``empty.txt`` object with content type ``text/plain``,
								    issues a HEAD for it, and then asks ``fs.info()`` for its type.

								    Args:
								        fs: s3fs filesystem used for the type check.
								        s3_client: boto3 S3 client used for the PUT and HEAD requests.

								    Returns:
								        True when HEAD succeeds and s3fs reports type ``'file'``; False if
								        the type differs or any step raises.
								    """
								    separator = "="*80
								    logger.info("\n" + separator)
								    logger.info("TEST 6: Empty File (not a directory)")
								    logger.info(separator)

								    file_key = 'empty.txt'
								    file_path = f"{BUCKET_NAME}/{file_key}"

								    # Step 1: upload the empty object.
								    logger.info(f"\nCreating empty file: {file_path}")
								    put_arguments = {
								        'Bucket': BUCKET_NAME,
								        'Key': file_key,
								        'Body': b'',
								        'ContentType': 'text/plain',
								    }
								    try:
								        s3_client.put_object(**put_arguments)
								        logger.info("  ✓ Created empty file")
								    except Exception as err:
								        logger.error(f"  ✗ Failed to create: {err}")
								        return False

								    # Step 2: HEAD it, then confirm s3fs classifies it as a file.
								    logger.info(f"\nTesting HEAD on: {file_path}")
								    try:
								        head = s3_client.head_object(Bucket=BUCKET_NAME, Key=file_key)
								        logger.info("  ✓ HEAD returned 200 (expected for empty file)")
								        logger.info(f"  Content-Type: {head.get('ContentType', 'N/A')}")

								        entry_type = fs.info(file_path).get('type')
								        if entry_type != 'file':
								            logger.warning(f"  ⚠️  s3fs identified as: {entry_type}")
								            return False

								        logger.info("  ✓ s3fs correctly identified as file")
								        return True
								    except Exception as err:
								        logger.error(f"  ✗ Error: {err}")
								        return False


								def main():
								    """Run every check in order and log a pass/fail summary.

								    Returns:
								        0 when all checks return a truthy result; 1 when at least one
								        check fails or the bucket cannot be created.
								    """
								    separator = "="*80
								    logger.info(separator)
								    logger.info("Implicit Directory Fix Test Suite")
								    logger.info(separator)
								    logger.info(f"Endpoint: {S3_ENDPOINT_URL}")
								    logger.info(f"Bucket: {BUCKET_NAME}")
								    logger.info(separator)

								    fs, s3_client = setup_s3()

								    # Make sure the bucket is there; an "already exists" answer is fine.
								    try:
								        s3_client.create_bucket(Bucket=BUCKET_NAME)
								        logger.info(f"\n✓ Created bucket: {BUCKET_NAME}")
								    except ClientError as err:
								        already_there = ('BucketAlreadyOwnedByYou', 'BucketAlreadyExists')
								        if err.response['Error']['Code'] not in already_there:
								            logger.error(f"\n✗ Failed to create bucket: {err}")
								            return 1
								        logger.info(f"\n✓ Bucket already exists: {BUCKET_NAME}")

								    # The checks run top to bottom; later ones read the dataset that the
								    # first one writes.
								    results = [
								        ("Implicit Directory HEAD", test_implicit_directory_head_behavior(fs, s3_client)),
								        ("s3fs Directory Detection", test_s3fs_directory_detection(fs)),
								        ("s3fs.isdir() Method", test_s3fs_isdir(fs)),
								        ("PyArrow Dataset Read", test_pyarrow_dataset_read(fs)),
								        ("Explicit Directory Marker", test_explicit_directory_marker(fs, s3_client)),
								        ("Empty File Not Directory", test_empty_file_not_directory(fs, s3_client)),
								    ]

								    # Summary
								    logger.info("\n" + separator)
								    logger.info("TEST SUMMARY")
								    logger.info(separator)

								    total = len(results)
								    passed = sum(bool(outcome) for _, outcome in results)

								    for name, outcome in results:
								        if outcome:
								            status = "✓ PASS"
								        else:
								            status = "✗ FAIL"
								        logger.info(f"{status}: {name}")

								    logger.info(separator)
								    logger.info(f"Results: {passed}/{total} tests passed")
								    logger.info(separator)

								    if passed != total:
								        logger.warning(f"\n⚠️  {total - passed} test(s) failed. The fix may not be fully working.")
								        return 1

								    logger.info("\n🎉 All tests passed! The implicit directory fix is working correctly.")
								    return 0


								# Script entry point: hand main()'s status code to the interpreter on exit.
								if __name__ == "__main__":
								    raise SystemExit(main())