You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

120 lines
3.1 KiB

#!/bin/bash
# Checks a Parquet file written by the Spark test: starts the docker compose
# services, runs the test, downloads the written chunk from the volume server
# and tries to read it with several independent Parquet readers.
set -e
echo "=== Testing Parquet file with multiple readers ==="
echo ""
# Start services
# The output is captured first so that docker's own exit status is checked.
# Piping straight into `grep -v` made the pipeline status grep's: docker
# failures went unnoticed and, under `set -e`, the script aborted whenever
# every output line contained "Running" (grep -v prints nothing, returns 1).
if ! up_output=$(docker compose up -d 2>&1); then
  printf '%s\n' "$up_output"
  echo "ERROR: docker compose up failed"
  exit 1
fi
if [[ -n "$up_output" ]]; then
  # grep returning 1 here only means "nothing left to show" - not an error.
  printf '%s\n' "$up_output" | grep -v "Running" || true
fi
sleep 2
# Run test and capture chunk ID
echo "1. Writing Parquet file and capturing chunk ID..."
# The test runs in the background while its output is copied to
# /tmp/test_output.log and the last 20 lines are shown once it ends.
# The output is routed through a process substitution instead of a pipeline:
# with `cmd | tee | tail &`, $! is the PID of the LAST pipeline stage (tail),
# so killing/waiting on TEST_PID would not target the docker test process.
# With this form $! is the PID of `docker compose run` itself.
docker compose run --rm -e SEAWEEDFS_TEST_ENABLED=true spark-tests bash -c '
cd /workspace
mvn test -Dtest=SparkSQLTest#testCreateTableAndQuery 2>&1
' > >(tee /tmp/test_output.log | tail -20) 2>&1 &
TEST_PID=$!
# Wait for the file to be written
echo "2. Waiting for file write..."

#######################################
# Prints the first chunk ID found on a "PARQUET FILE WRITTEN TO EMPLOYEES"
# log line, i.e. the text between "CHUNKS: [" and the next "]".
# Uses grep -F plus parameter expansion instead of `grep -oP`, because -P
# (PCRE) is a GNU extension that BSD/macOS grep does not provide.
# Arguments: $1 - path of the log file to scan
# Outputs:   the chunk ID on stdout
# Returns:   0 if an ID was found, 1 otherwise
#######################################
find_chunk_id() {
  local log_file=$1 line rest id
  while IFS= read -r line; do
    [[ "$line" == *"CHUNKS: ["* ]] || continue
    rest=${line#*"CHUNKS: ["}   # drop everything up to the first "CHUNKS: ["
    id=${rest%%"]"*}            # keep what precedes the closing bracket
    if [[ -n "$id" ]]; then
      printf '%s\n' "$id"
      return 0
    fi
  done < <(grep -F -- "PARQUET FILE WRITTEN TO EMPLOYEES" "$log_file" 2>/dev/null)
  return 1
}

# Extract chunk ID from logs
# Poll once per second instead of sleeping a fixed 10s + 5s: the script moves
# on as soon as the ID is logged. The ceiling stays at 15 seconds by default
# and can be raised through CHUNK_WAIT_SECS for slower machines.
max_wait=${CHUNK_WAIT_SECS:-15}
waited=0
CHUNK_ID=""
while :; do
  # `|| true`: "not found yet" must not trip `set -e`.
  CHUNK_ID=$(find_chunk_id /tmp/test_output.log) || true
  if [[ -n "$CHUNK_ID" ]] || (( waited >= max_wait )); then
    break
  fi
  sleep 1
  waited=$((waited + 1))
done
if [[ -z "$CHUNK_ID" ]]; then
  echo "ERROR: Could not find chunk ID in logs"
  echo "Log excerpt:"
  grep -E "PARQUET|CHUNKS|employees" /tmp/test_output.log | tail -20
  # Best effort: the background test may already have exited.
  kill "$TEST_PID" 2>/dev/null || true
  exit 1
fi
echo "Found chunk ID: $CHUNK_ID"
# Download directly from volume server
echo "3. Downloading from volume server..."
# -f makes curl return non-zero on HTTP errors (e.g. 404) instead of saving
# the error page as a non-empty "parquet" file; -S still prints the reason.
# curl runs inside the `if` so that a failure reaches the error branch below
# rather than making `set -e` exit silently.
if ! curl -fsS "http://localhost:8080/$CHUNK_ID" -o /tmp/test.parquet \
  || [[ ! -s /tmp/test.parquet ]]; then
  echo "ERROR: Download failed!"
  # Do not leave the background test running when bailing out.
  kill "$TEST_PID" 2>/dev/null || true
  exit 1
fi
# wc -c works the same on GNU and BSD systems; tr strips the padding that
# BSD wc puts in front of the number.
FILE_SIZE=$(wc -c < /tmp/test.parquet | tr -d '[:space:]')
echo "Downloaded: $FILE_SIZE bytes"
echo ""
# Kill test process
# Best effort: the process may already have exited, hence `|| true`.
kill "$TEST_PID" 2>/dev/null || true
wait "$TEST_PID" 2>/dev/null || true
# Test with readers
printf '%s\n' "=== Testing with Multiple Parquet Readers ===" ""
# Check magic bytes
# The first and last four bytes of the file are compared, as hex, against
# 50415231 (the ASCII bytes "PAR1").
printf '%s\n' "1. Magic Bytes:"
FIRST=$(head -c 4 /tmp/test.parquet | xxd -p)
LAST=$(tail -c 4 /tmp/test.parquet | xxd -p)
printf ' First 4 bytes: %s\n' "$FIRST"
printf ' Last 4 bytes: %s\n' "$LAST"
case "$FIRST:$LAST" in
  "50415231:50415231")
    printf '%s\n' " ✅ Valid PAR1 magic"
    ;;
  *)
    printf '%s\n' " ❌ Invalid magic!"
    ;;
esac
printf '\n'
# Python pyarrow
echo "2. Python pyarrow:"
# The Python body needs its indentation: without it the `try:` block is a
# syntax error. The import sits inside the try so that a missing module is
# reported as a failed reader, and `|| true` keeps `set -e` from aborting the
# remaining reader checks if python3 itself cannot run.
python3 -c "
try:
    import pyarrow.parquet as pq
    table = pq.read_table('/tmp/test.parquet')
    print(f' ✅ Read {table.num_rows} rows, {table.num_columns} columns')
    print(f' Data: {table.to_pandas().to_dict(\"records\")}')
except Exception as e:
    print(f' ❌ FAILED: {e}')
" 2>&1 || true
echo ""
# Pandas
echo "3. Pandas:"
# The Python body needs its indentation: without it the `try:` block is a
# syntax error. The import sits inside the try so that a missing module is
# reported as a failed reader, and `|| true` keeps `set -e` from aborting the
# remaining reader checks if python3 itself cannot run.
python3 -c "
try:
    import pandas as pd
    df = pd.read_parquet('/tmp/test.parquet')
    print(f' ✅ Read {len(df)} rows')
    print(f' Data:\n{df}')
except Exception as e:
    print(f' ❌ FAILED: {e}')
" 2>&1 || true
echo ""
# DuckDB
echo "4. DuckDB:"
# The Python body needs its indentation: without it the `try:` block is a
# syntax error. The import sits inside the try so that a missing module is
# reported as a failed reader, and `|| true` keeps `set -e` from aborting the
# summary below if python3 itself cannot run.
python3 -c "
try:
    import duckdb
    conn = duckdb.connect(':memory:')
    result = conn.execute('SELECT * FROM \"/tmp/test.parquet\"').fetchall()
    print(f' ✅ Read {len(result)} rows')
    print(f' Data: {result}')
except Exception as e:
    print(f' ❌ FAILED: {e}')
" 2>&1 || true
echo ""
# Closing summary: the downloaded size plus a hint on how to read the reader
# results printed above.
printf '%s\n' \
  "=== Summary ===" \
  "File: $FILE_SIZE bytes" \
  "If readers succeeded: File is VALID ✅" \
  "If readers failed: Footer metadata is corrupted ❌"