You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
307 lines
10 KiB
307 lines
10 KiB
#!/usr/bin/env python3
|
|
"""
|
|
Test script to verify the implicit directory fix for s3fs compatibility.
|
|
|
|
This test verifies that:
|
|
1. Implicit directory markers (0-byte objects with children) return 404 on HEAD
|
|
2. s3fs correctly identifies them as directories via LIST fallback
|
|
3. PyArrow can read datasets created with write_dataset()
|
|
|
|
The fix makes SeaweedFS behave like AWS S3 and improves s3fs compatibility.
|
|
"""
|
|
|
|
import io
|
|
import logging
|
|
import os
|
|
import sys
|
|
import traceback
|
|
|
|
import pyarrow as pa
|
|
import pyarrow.dataset as pads
|
|
import pyarrow.parquet as pq
|
|
import s3fs
|
|
import boto3
|
|
from botocore.exceptions import ClientError
|
|
|
|
# Logging setup: INFO and above, each record prefixed with time and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Connection settings. Each value can be overridden through the environment
# variable of the same name; the literals below are the fallbacks.
S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "http://localhost:8333")
S3_ACCESS_KEY = os.getenv("S3_ACCESS_KEY", "some_access_key1")
S3_SECRET_KEY = os.getenv("S3_SECRET_KEY", "some_secret_key1")
BUCKET_NAME = os.getenv("BUCKET_NAME", "test-implicit-dir")
|
|
|
|
def create_sample_table(num_rows: int = 1000) -> pa.Table:
    """Build a sample PyArrow table with ``id``, ``value`` and ``score`` columns.

    Args:
        num_rows: Number of rows to generate; row ``i`` holds ``i``,
            ``'value_<i>'`` and ``i * 1.5``.

    Returns:
        The table produced by ``pa.table`` from the three column arrays.
    """
    indices = range(num_rows)
    id_column = pa.array(indices, type=pa.int64())
    value_column = pa.array([f'value_{i}' for i in indices], type=pa.string())
    score_column = pa.array([float(i) * 1.5 for i in indices], type=pa.float64())
    return pa.table({
        'id': id_column,
        'value': value_column,
        'score': score_column,
    })
|
|
|
|
def setup_s3():
    """Create the S3 clients used by the tests.

    Both clients are built from the module-level endpoint URL and
    credentials, with SSL disabled.

    Returns:
        A ``(fs, s3_client)`` tuple: the ``s3fs.S3FileSystem`` instance and
        the boto3 S3 client.
    """
    # Filesystem-style access through s3fs.
    endpoint_kwargs = {'endpoint_url': S3_ENDPOINT_URL}
    fs = s3fs.S3FileSystem(
        key=S3_ACCESS_KEY,
        secret=S3_SECRET_KEY,
        client_kwargs=endpoint_kwargs,
        use_ssl=False,
    )

    # Raw S3 operations (HEAD / PUT / bucket creation) through boto3.
    boto_kwargs = {
        'endpoint_url': S3_ENDPOINT_URL,
        'aws_access_key_id': S3_ACCESS_KEY,
        'aws_secret_access_key': S3_SECRET_KEY,
        'use_ssl': False,
    }
    s3_client = boto3.client('s3', **boto_kwargs)

    return fs, s3_client
|
|
|
|
def test_implicit_directory_head_behavior(fs, s3_client):
    """Test that HEAD on implicit directory markers returns 404.

    Removes any previous data under ``{BUCKET_NAME}/test_implicit_dir``,
    writes a sample Parquet dataset there with PyArrow, logs the resulting
    listing, and then issues a HEAD request for the bare key
    ``test_implicit_dir`` (no trailing slash).

    Args:
        fs: s3fs filesystem used for cleanup, writing and listing.
        s3_client: boto3 S3 client used for the raw HEAD request.

    Returns:
        True if the HEAD request fails with a 404; False if it succeeds or
        fails with any other ``ClientError``.
    """
    logger.info("\n" + "="*80)
    logger.info("TEST 1: Implicit Directory HEAD Behavior")
    logger.info("="*80)

    dir_key = 'test_implicit_dir'
    test_path = f"{BUCKET_NAME}/{dir_key}"

    # Best-effort cleanup of leftovers from an earlier run. Only ordinary
    # exceptions are ignored, so Ctrl-C / SystemExit still propagate.
    try:
        fs.rm(test_path, recursive=True)
    except Exception as e:
        logger.debug(f"Cleanup of {test_path} skipped: {e}")

    # Create a dataset using PyArrow (creates implicit directory)
    logger.info(f"Creating dataset at: {test_path}")
    table = create_sample_table(1000)
    pads.write_dataset(table, test_path, filesystem=fs, format='parquet')

    # List what was created
    logger.info("\nFiles created:")
    for entry in fs.ls(test_path, detail=True):
        logger.info(f" {entry['name']} - size: {entry['size']} bytes, type: {entry['type']}")

    # Test HEAD request on the directory marker (without trailing slash)
    logger.info(f"\nTesting HEAD on: {test_path}")
    try:
        response = s3_client.head_object(Bucket=BUCKET_NAME, Key=dir_key)
    except ClientError as e:
        # A HEAD response has no body, so the error code is derived from the
        # HTTP status; check both the code string and the numeric status.
        error_code = e.response.get('Error', {}).get('Code')
        http_status = e.response.get('ResponseMetadata', {}).get('HTTPStatusCode')
        if error_code == '404' or http_status == 404:
            logger.info(" ✓ HEAD returned 404 (expected - implicit directory)")
            return True
        logger.error(f" ✗ Unexpected error: {e}")
        return False

    # HEAD succeeded: the marker is visible as an object, which is the
    # behavior the fix is supposed to remove.
    logger.info(f" HEAD response: {response['ResponseMetadata']['HTTPStatusCode']}")
    logger.info(f" Content-Length: {response.get('ContentLength', 'N/A')}")
    logger.info(f" Content-Type: {response.get('ContentType', 'N/A')}")
    logger.warning(" ⚠️ Expected 404, but got 200 - fix may not be working")
    return False
|
|
|
|
def test_s3fs_directory_detection(fs):
    """Check that ``fs.info()`` reports the dataset path as a directory.

    Args:
        fs: s3fs filesystem to query.

    Returns:
        True if the reported type is ``'directory'``; False for any other
        type or if the lookup raises.
    """
    banner = "="*80
    logger.info("\n" + banner)
    logger.info("TEST 2: s3fs Directory Detection")
    logger.info(banner)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    # Test s3fs.info()
    logger.info(f"\nTesting s3fs.info('{test_path}'):")
    try:
        info = fs.info(test_path)
        logger.info(f" Type: {info.get('type', 'N/A')}")
        logger.info(f" Size: {info.get('size', 'N/A')}")

        # Guard clause: anything other than 'directory' is a failure.
        if info.get('type') != 'directory':
            logger.warning(f" ⚠️ s3fs identified as: {info.get('type')}")
            return False

        logger.info(" ✓ s3fs correctly identified as directory")
        return True
    except Exception as e:
        logger.error(f" ✗ Error: {e}")
        return False
|
|
|
|
def test_s3fs_isdir(fs):
    """Check that ``fs.isdir()`` is truthy for the dataset path.

    Args:
        fs: s3fs filesystem to query.

    Returns:
        True if ``fs.isdir()`` returns a truthy value; False otherwise or
        if the call raises.
    """
    banner = "="*80
    logger.info("\n" + banner)
    logger.info("TEST 3: s3fs.isdir() Method")
    logger.info(banner)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    logger.info(f"\nTesting s3fs.isdir('{test_path}'):")
    try:
        is_dir = fs.isdir(test_path)
        logger.info(f" Result: {is_dir}")

        # Guard clause for the failure case; success falls through.
        if not is_dir:
            logger.warning(" ⚠️ s3fs.isdir() returned False")
            return False

        logger.info(" ✓ s3fs.isdir() correctly returned True")
        return True
    except Exception as e:
        logger.error(f" ✗ Error: {e}")
        return False
|
|
|
|
def test_pyarrow_dataset_read(fs):
    """Check that PyArrow can open the dataset path and load it as a table.

    Args:
        fs: s3fs filesystem handed to ``pyarrow.dataset.dataset``.

    Returns:
        True if the dataset is read into a table; False if any step raises
        (the error is logged and the traceback printed).
    """
    banner = "="*80
    logger.info("\n" + banner)
    logger.info("TEST 4: PyArrow Dataset Read")
    logger.info(banner)

    test_path = f"{BUCKET_NAME}/test_implicit_dir"

    logger.info(f"\nReading dataset from: {test_path}")
    try:
        table = pads.dataset(test_path, filesystem=fs, format='parquet').to_table()
        logger.info(f" ✓ Successfully read {len(table)} rows")
        logger.info(f" Columns: {table.column_names}")
    except Exception as e:
        logger.error(f" ✗ Failed to read dataset: {e}")
        traceback.print_exc()
        return False
    return True
|
|
|
|
def test_explicit_directory_marker(fs, s3_client):
    """Check that an explicit directory marker (trailing slash) answers HEAD.

    Puts an empty ``explicit_dir/`` object with content type
    ``httpd/unix-directory`` and then issues a HEAD on the same key.

    Args:
        fs: s3fs filesystem; not used by this test.
        s3_client: boto3 S3 client used for the PUT and HEAD requests.

    Returns:
        True if both the PUT and the HEAD succeed; False otherwise.
    """
    banner = "="*80
    logger.info("\n" + banner)
    logger.info("TEST 5: Explicit Directory Marker (with trailing slash)")
    logger.info(banner)

    marker_key = 'explicit_dir/'

    # Create an explicit directory marker
    logger.info(f"\nCreating explicit directory: {BUCKET_NAME}/explicit_dir/")
    try:
        s3_client.put_object(
            Bucket=BUCKET_NAME,
            Key=marker_key,
            Body=b'',
            ContentType='httpd/unix-directory',
        )
        logger.info(" ✓ Created explicit directory marker")
    except Exception as e:
        logger.error(f" ✗ Failed to create: {e}")
        return False

    # Test HEAD with trailing slash
    logger.info(f"\nTesting HEAD on: {BUCKET_NAME}/explicit_dir/")
    try:
        head = s3_client.head_object(Bucket=BUCKET_NAME, Key=marker_key)
        logger.info(" ✓ HEAD returned 200 (expected for explicit directory)")
        logger.info(f" Content-Type: {head.get('ContentType', 'N/A')}")
    except ClientError as e:
        logger.error(f" ✗ HEAD failed: {e}")
        return False
    return True
|
|
|
|
def test_empty_file_not_directory(fs, s3_client):
    """Check that a legitimate empty file is not treated as a directory.

    Puts a zero-byte ``empty.txt`` object with content type ``text/plain``,
    issues a HEAD on it, then asks s3fs for its type.

    Args:
        fs: s3fs filesystem used for the ``info()`` lookup.
        s3_client: boto3 S3 client used for the PUT and HEAD requests.

    Returns:
        True if the PUT and HEAD succeed and s3fs reports type ``'file'``;
        False otherwise.
    """
    banner = "="*80
    logger.info("\n" + banner)
    logger.info("TEST 6: Empty File (not a directory)")
    logger.info(banner)

    file_key = 'empty.txt'

    # Create an empty file with text/plain mime type
    logger.info(f"\nCreating empty file: {BUCKET_NAME}/empty.txt")
    try:
        s3_client.put_object(
            Bucket=BUCKET_NAME,
            Key=file_key,
            Body=b'',
            ContentType='text/plain',
        )
        logger.info(" ✓ Created empty file")
    except Exception as e:
        logger.error(f" ✗ Failed to create: {e}")
        return False

    # Test HEAD
    logger.info(f"\nTesting HEAD on: {BUCKET_NAME}/empty.txt")
    try:
        head = s3_client.head_object(Bucket=BUCKET_NAME, Key=file_key)
        logger.info(" ✓ HEAD returned 200 (expected for empty file)")
        logger.info(f" Content-Type: {head.get('ContentType', 'N/A')}")

        # Verify s3fs doesn't think it's a directory
        info = fs.info(f"{BUCKET_NAME}/empty.txt")
        if info.get('type') != 'file':
            logger.warning(f" ⚠️ s3fs identified as: {info.get('type')}")
            return False

        logger.info(" ✓ s3fs correctly identified as file")
        return True
    except Exception as e:
        logger.error(f" ✗ Error: {e}")
        return False
|
|
|
|
def _run_test(name, test_func, *args):
    """Run one test and return ``(name, passed)``, treating an escaped exception as a failure."""
    try:
        passed = bool(test_func(*args))
    except Exception:
        # Record the failure with its traceback instead of aborting the whole
        # suite, so the remaining tests and the summary still run.
        logger.exception("✗ %s raised an unexpected exception", name)
        passed = False
    return name, passed


def main():
    """Run all tests and log a summary.

    Creates the clients, ensures the bucket exists, runs the six tests in
    order, and logs a pass/fail line per test plus a total.

    Returns:
        0 if every test passed; 1 if any test failed or the bucket could not
        be created for a reason other than it already existing.
    """
    logger.info("="*80)
    logger.info("Implicit Directory Fix Test Suite")
    logger.info("="*80)
    logger.info(f"Endpoint: {S3_ENDPOINT_URL}")
    logger.info(f"Bucket: {BUCKET_NAME}")
    logger.info("="*80)

    # Set up S3 clients
    fs, s3_client = setup_s3()

    # Create bucket if it doesn't exist
    try:
        s3_client.create_bucket(Bucket=BUCKET_NAME)
        logger.info(f"\n✓ Created bucket: {BUCKET_NAME}")
    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code in ['BucketAlreadyOwnedByYou', 'BucketAlreadyExists']:
            logger.info(f"\n✓ Bucket already exists: {BUCKET_NAME}")
        else:
            logger.error(f"\n✗ Failed to create bucket: {e}")
            return 1

    # Run tests. Order matters: the first test writes the dataset that the
    # following three read back.
    test_plan = [
        ("Implicit Directory HEAD", test_implicit_directory_head_behavior, (fs, s3_client)),
        ("s3fs Directory Detection", test_s3fs_directory_detection, (fs,)),
        ("s3fs.isdir() Method", test_s3fs_isdir, (fs,)),
        ("PyArrow Dataset Read", test_pyarrow_dataset_read, (fs,)),
        ("Explicit Directory Marker", test_explicit_directory_marker, (fs, s3_client)),
        ("Empty File Not Directory", test_empty_file_not_directory, (fs, s3_client)),
    ]
    results = [_run_test(name, func, *args) for name, func, args in test_plan]

    # Print summary
    logger.info("\n" + "="*80)
    logger.info("TEST SUMMARY")
    logger.info("="*80)

    passed = sum(1 for _, result in results if result)
    total = len(results)

    for name, result in results:
        status = "✓ PASS" if result else "✗ FAIL"
        logger.info(f"{status}: {name}")

    logger.info("="*80)
    logger.info(f"Results: {passed}/{total} tests passed")
    logger.info("="*80)

    if passed == total:
        logger.info("\n🎉 All tests passed! The implicit directory fix is working correctly.")
        return 0

    logger.warning(f"\n⚠️ {total - passed} test(s) failed. The fix may not be fully working.")
    return 1
|
|
|
|
# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
|
|
|