You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
86 lines
2.8 KiB
86 lines
2.8 KiB
#!/usr/bin/env python3
"""Debug script to understand what pads.write_dataset creates.

Writes a three-row table to an S3 endpoint at http://localhost:8333 with
``pyarrow.dataset.write_dataset``, prints a recursive listing of everything
found under the target prefix, then tries to read the dataset back using the
path with and without a trailing slash.

Exits with status 1 (after printing a traceback) if the write, the listing
driver, or the read-back section raises.
"""

import sys
import traceback

import pyarrow as pa
import pyarrow.dataset as pads
import s3fs

# Single source of truth for the target location; every path below is
# derived from these so the write path, listing path and read paths cannot
# drift apart.
BUCKET = 'test-bucket'
DATASET_DIR = 'test-write-simple'

# Create a simple test table
table = pa.table({'id': [1, 2, 3], 'value': [1.0, 2.0, 3.0]})

# Initialize S3 filesystem
# NOTE(review): key/secret look like throwaway credentials for a local test
# endpoint -- confirm they are not real secrets before sharing this script.
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'http://localhost:8333'},
    key='some_access_key1',
    secret='some_secret_key1',
    # Listings cache is turned off; presumably so each ls/exists call
    # reflects what is in the bucket right now -- see the s3fs docs.
    use_listings_cache=False,
)


def list_recursive(path, indent=0):
    """Print a tree of the entries under ``path`` on the module-level ``fs``.

    Directories are printed with a trailing ``/`` and descended into; files
    are printed with their size when ``fs.info`` can provide it, and without
    a size otherwise.

    Args:
        path: Bucket-relative path to list (no ``s3://`` prefix is used by
            the caller in this script).
        indent: Nesting depth; each level prefixes the line with one space.

    Any exception raised while listing ``path`` is reported on stdout and
    swallowed, so one unreadable prefix does not abort the whole dump.
    """
    pad = ' ' * indent
    try:
        for item in fs.ls(path, detail=False):
            # Strip a trailing '/' first so keys ending in '/' still get a
            # visible name instead of an empty string.
            item_name = item.rstrip('/').rsplit('/', 1)[-1]
            # Defensive: if the listing ever reports the listed path itself
            # (e.g. a directory-marker key), show it as a plain entry rather
            # than recursing into the same path again.
            is_self = item.rstrip('/') == path.rstrip('/')
            if fs.isdir(item) and not is_self:
                print(f"{pad}📁 {item_name}/")
                list_recursive(item, indent + 1)
                continue
            # Get file size; fall back to the bare name if info() fails.
            try:
                size = fs.info(item).get('size', 0)
            except Exception:
                print(f"{pad}📄 {item_name}")
            else:
                print(f"{pad}📄 {item_name} ({size} bytes)")
    except Exception as e:
        print(f"{pad}Error listing {path}: {e}")


def _try_read(path):
    """Open ``path`` as a parquet dataset on ``fs`` and print the outcome.

    Prints the row count on success or the exception message on failure;
    never raises for read errors so the next read method can still run.
    """
    try:
        result = pads.dataset(path, format='parquet', filesystem=fs).to_table()
    except Exception as e:
        print(f" ✗ Failed: {e}")
    else:
        print(f" ✓ Success: {result.num_rows} rows")


# Create bucket
if not fs.exists(BUCKET):
    fs.mkdir(BUCKET)

# Write with pads.write_dataset
test_path = f's3://{BUCKET}/{DATASET_DIR}/'
print(f"Writing to: {test_path}")
print(f"Table schema: {table.schema}")
print(f"Table rows: {table.num_rows}")

try:
    pads.write_dataset(table, test_path, format='parquet', filesystem=fs)
    print("\n✓ Write succeeded")

    # List all files recursively
    print(f"\nListing all files recursively under {test_path}:")
    base_path = f'{BUCKET}/{DATASET_DIR}'
    list_recursive(base_path)

    # Try to read back with different methods
    print("\n\nTrying to read back using different methods:")

    # Method 1: pads.dataset with the same path that was written to
    print(f"\n1. pads.dataset('{test_path}'):")
    _try_read(test_path)

    # Method 2: the same location, but without the trailing slash
    print("\n2. pads.dataset without trailing slash:")
    test_path_no_slash = test_path.rstrip('/')
    _try_read(test_path_no_slash)
except Exception as e:
    print(f"✗ Error: {e}")
    traceback.print_exc()
    sys.exit(1)