You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
60 lines
2.0 KiB
60 lines
2.0 KiB
#!/bin/bash
#
# Checks whether a Parquet file written by Spark into SeaweedFS can be read by
# parquet-tools. Steps:
#   1. Run the Spark test that writes the table (inside the spark-tests
#      docker compose service).
#   2. Ask the SeaweedFS filer for the name of a written .parquet file.
#   3. Download that file from the filer to the local machine.
#   4. Inspect and read it with parquet-tools and print a conclusion.
#
# Optional environment overrides (defaults are the previously hard-coded values):
#   FILER_URL      base URL of the filer as seen from the host
#   WRITE_LOG      where the output of the Spark write step is stored
#   LOCAL_PARQUET  where the downloaded Parquet file is stored
#
# Exit status: 1 when no file is found, the download fails, or parquet-tools
# is unavailable; 0 otherwise (the read result is reported in the conclusion).

set -euo pipefail

FILER_URL="${FILER_URL:-http://localhost:8888}"
WRITE_LOG="${WRITE_LOG:-/tmp/write_test.log}"
LOCAL_PARQUET="${LOCAL_PARQUET:-/tmp/test.parquet}"

echo "=== Testing if Parquet file written by Spark can be read by parquet-tools ==="

# Run the test to write a Parquet file.
# Best effort on purpose: the Spark test may fail while reading the data back,
# yet still leave a written file behind that we want to examine.
echo "1. Writing Parquet file with Spark..."
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
  cd /workspace
  mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery -q 2>&1 | tail -5
' > "$WRITE_LOG" 2>&1 || true

# Find the Parquet file that was written.
# The trailing "|| true" keeps a non-matching grep (or a failing docker run)
# from aborting under "set -e"/pipefail, so the explicit check below reports it.
echo "2. Finding Parquet file..."
PARQUET_FILE=$(docker compose run --rm spark-tests bash -c '
  curl -s "http://seaweedfs-filer:8888/test-spark/employees/?pretty=y" | grep -oP "\"name\":\s*\"\K[^\"]+\.parquet" | head -1
' 2>&1 | grep -v "Creating" | grep "\.parquet" | head -1) || true

if [[ -z "$PARQUET_FILE" ]]; then
  echo "ERROR: No Parquet file found!" >&2
  exit 1
fi

echo "Found file: $PARQUET_FILE"

# Download the file.
echo "3. Downloading file from SeaweedFS..."
# Remove any leftover from a previous run so a failed download cannot be
# mistaken for a fresh file.
rm -f -- "$LOCAL_PARQUET"
# --fail makes HTTP errors (e.g. 404) a curl failure instead of saving the
# error body as if it were the Parquet file; -S still prints the reason.
if ! curl -fsS "$FILER_URL/test-spark/employees/$PARQUET_FILE" -o "$LOCAL_PARQUET" \
  || [[ ! -s "$LOCAL_PARQUET" ]]; then
  echo "ERROR: Failed to download file!" >&2
  exit 1
fi

# wc -c works the same on GNU and BSD userlands; strip the padding some
# implementations add around the number.
FILE_SIZE=$(wc -c < "$LOCAL_PARQUET" | tr -d '[:space:]')
echo "Downloaded $FILE_SIZE bytes"

# Try to read with parquet-tools.
echo "4. Reading with parquet-tools..."
pip3 install -q parquet-tools 2>&1 | grep -v "Requirement already satisfied" || true

# Without this check a missing tool would be reported below as a corrupted file.
if ! command -v parquet-tools > /dev/null 2>&1; then
  echo "ERROR: parquet-tools is not available (pip3 install failed?)" >&2
  exit 1
fi

echo ""
echo "=== Parquet Metadata ==="
parquet-tools inspect "$LOCAL_PARQUET" 2>&1 || echo "FAILED to inspect"

echo ""
echo "=== Try to read data ==="
# Remember the outcome so the conclusion does not have to read the file again.
show_ok=true
parquet-tools show "$LOCAL_PARQUET" 2>&1 || {
  show_ok=false
  echo "FAILED to read data"
}

echo ""
echo "=== Conclusion ==="
if [[ "$show_ok" == true ]]; then
  echo "✅ SUCCESS: File can be read by parquet-tools!"
  echo "The file itself is VALID Parquet format."
  echo "The issue is specific to how Spark reads it back."
else
  echo "❌ FAILED: File cannot be read by parquet-tools"
  echo "The file is CORRUPTED or has invalid Parquet format."
fi