#!/bin/bash
#
# Checks whether a Parquet file written by Spark to SeaweedFS can be read by
# parquet-tools.
#
# Steps:
#   1. Run the SparkSQLTest#testCreateTableAndQuery Maven test in the
#      spark-tests compose service so a Parquet file is written.
#   2. List /test-spark/employees/ on the filer (from inside the container)
#      and take the first *.parquet name.
#   3. Download that file from http://localhost:8888 to /tmp/test.parquet.
#   4. Inspect and read it with parquet-tools and print a conclusion.
#
# Requires on the host: docker (with the compose plugin), curl, pip3.
# Exit status: 1 when a prerequisite step fails (no file found, download
# failed, parquet-tools unavailable); 0 otherwise, whichever conclusion is
# printed.
set -eu

readonly WRITE_LOG=/tmp/write_test.log
readonly LOCAL_PARQUET=/tmp/test.parquet
readonly LOCAL_FILER_URL="http://localhost:8888/test-spark/employees"

echo "=== Testing if Parquet file written by Spark can be read by parquet-tools ==="

# Run the test to write a Parquet file.
# Best effort on purpose: even if the Maven test fails, the file may already
# have been written, so the outcome is only logged to $WRITE_LOG.
echo "1. Writing Parquet file with Spark..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
  cd /workspace
  mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery -q 2>&1 | tail -5
' > "$WRITE_LOG" 2>&1 || true

# Find the Parquet file that was written.
# `tr -d "\r"` drops carriage returns that appear when docker allocates a
# pseudo-TTY; otherwise they would end up inside the download URL.
echo "2. Finding Parquet file..."
PARQUET_FILE=$(docker compose run --rm spark-tests bash -c '
  curl -s "http://seaweedfs-filer:8888/test-spark/employees/?pretty=y" | grep -oP "\"name\":\s*\"\K[^\"]+\.parquet" | head -1
' 2>&1 | tr -d '\r' | grep -v "Creating" | grep "\.parquet" | head -1)

if [[ -z "$PARQUET_FILE" ]]; then
  echo "ERROR: No Parquet file found!"
  exit 1
fi

echo "Found file: $PARQUET_FILE"

# Download the file.
# Remove any copy left by a previous run so a failed download cannot be
# mistaken for a fresh one, and use --fail so an HTTP error page is never
# saved as if it were the Parquet file.
echo "3. Downloading file from SeaweedFS..."
rm -f -- "$LOCAL_PARQUET"
if ! curl -fsS "$LOCAL_FILER_URL/$PARQUET_FILE" -o "$LOCAL_PARQUET" \
  || [[ ! -s "$LOCAL_PARQUET" ]]; then
  echo "ERROR: Failed to download file!"
  exit 1
fi

# wc -c behaves the same on GNU and BSD systems; strip the padding BSD adds.
FILE_SIZE=$(wc -c < "$LOCAL_PARQUET" | tr -d '[:space:]')
echo "Downloaded $FILE_SIZE bytes"

# Try to read with parquet-tools.
echo "4. Reading with parquet-tools..."
pip3 install -q parquet-tools 2>&1 | grep -v "Requirement already satisfied" || true

# Without this check a missing tool would be reported below as a corrupted
# Parquet file.
if ! command -v parquet-tools > /dev/null 2>&1; then
  echo "ERROR: parquet-tools is not available (pip3 install failed?)"
  exit 1
fi

echo ""
echo "=== Parquet Metadata ==="
parquet-tools inspect "$LOCAL_PARQUET" 2>&1 || echo "FAILED to inspect"

echo ""
echo "=== Try to read data ==="
# Remember the outcome so the conclusion does not have to read the file again.
show_ok=true
parquet-tools show "$LOCAL_PARQUET" 2>&1 || {
  echo "FAILED to read data"
  show_ok=false
}

echo ""
echo "=== Conclusion ==="
# Either verdict is a valid diagnostic result, so the exit status stays 0.
if [[ "$show_ok" == true ]]; then
  echo "✅ SUCCESS: File can be read by parquet-tools!"
  echo "The file itself is VALID Parquet format."
  echo "The issue is specific to how Spark reads it back."
else
  echo "❌ FAILED: File cannot be read by parquet-tools"
  echo "The file is CORRUPTED or has invalid Parquet format."
fi