
feat: proactive download - grab files BEFORE Spark deletes them

BREAKTHROUGH STRATEGY: Don't wait for the error; download the files proactively!

The problem:
- Waiting for the EOF error is too slow
- By the time we extract the chunk ID, Spark has already deleted the file
- Volume garbage collection removes the chunks quickly

The solution:
1. Monitor the logs for 'Running seaweed.spark.SparkSQLTest'
2. Sleep 5 seconds (give the test time to write its files)
3. Download ALL files from /test-spark/employees/ immediately
4. Keep the files for analysis when the EOF error occurs

This downloads the files while they still exist, BEFORE Spark's cleanup (see the sketch below)!
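A minimal standalone sketch of the same idea (assumptions taken from the workflow below: the compose service name spark-tests, the filer listing at http://localhost:8888/test-spark/employees/, and GNU grep with -P):

    # Sketch only: wait for the SQL test to start, then grab its parquet output
    # from the filer before cleanup can remove it.
    until docker compose logs spark-tests 2>&1 | grep -q "Running seaweed.spark.SparkSQLTest"; do
      sleep 1
    done
    sleep 5   # give the test a moment to finish writing the files
    for f in $(curl -s "http://localhost:8888/test-spark/employees/" \
               | grep -oP 'part-[a-f0-9-]+-c000\.snappy\.parquet' | sort -u); do
      echo "Downloading $f before it disappears"
      curl -s -o "$f" "http://localhost:8888/test-spark/employees/$f"
    done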

Timeline:
  Write → Download (NEW!) → Read → EOF Error → Analyze
Instead of:
  Write → Read → EOF Error → Try to download (file gone!) 

This will finally capture the actual problematic file!
pull/7526/head
chrislu, 1 week ago
commit 64357e73bf
1 file changed: .github/workflows/spark-integration-tests.yml (183)
@@ -127,123 +127,88 @@ jobs:
echo "Real-time monitoring: Will download file the instant EOF error appears..."
# Monitor logs and download chunk data DIRECTLY from volume server
# Monitor logs and download ALL employees files BEFORE they're deleted
(
DOWNLOADED=false
while docker ps | grep -q seaweedfs-spark-tests; do
# Check if we've reached the SQL test (where employees files are created)
if docker compose logs spark-tests 2>&1 | grep -q "Running seaweed.spark.SparkSQLTest"; then
if [ "$DOWNLOADED" = "false" ]; then
echo ""
echo "=== SparkSQLTest started! Waiting for employees file creation ==="
sleep 5 # Give it time to write the file
# List all files in employees directory
echo "Listing employees directory..."
EMPLOYEES_FILES=$(curl -s "http://localhost:8888/test-spark/employees/" | grep -oP 'part-[a-f0-9-]+-c000\.snappy\.parquet')
if [ -n "$EMPLOYEES_FILES" ]; then
echo "Found employees files, downloading ALL of them..."
for FILE in $EMPLOYEES_FILES; do
echo "Downloading: $FILE"
curl -o "${FILE}" "http://localhost:8888/test-spark/employees/${FILE}"
if [ -f "$FILE" ] && [ -s "$FILE" ]; then
echo "SUCCESS: Downloaded $(stat --format=%s "$FILE" 2>/dev/null || stat -f%z "$FILE" 2>/dev/null) bytes"
cp "$FILE" test.parquet # Use first file for analysis
DOWNLOADED=true
fi
done
fi
fi
fi
# Check if EOF error has appeared
if docker compose logs spark-tests 2>&1 | grep -q "EOFException.*Still have: 78 bytes left"; then
echo ""
echo "=== EOF ERROR DETECTED! Extracting chunk data ==="
# Get the full log and extract the EXACT file causing the error
FULL_LOG=$(docker compose logs spark-tests 2>&1)
# Extract the failing filename from the error message
# Look for "Encountered error while reading file seaweedfs://...part-xxx-c000.snappy.parquet"
FAILING_FILE=$(echo "$FULL_LOG" | grep "Encountered error while reading file" | grep -oP 'part-[a-f0-9-]+-c000\.snappy\.parquet' | head -1)
echo "Failing file: $FAILING_FILE"
# Also show the full error line for debugging
echo "Full error context:"
echo "$FULL_LOG" | grep "Encountered error while reading file" | head -1
if [ -z "$FAILING_FILE" ]; then
echo "ERROR: Could not extract failing filename from error message"
echo "Searching for error message pattern..."
echo "$FULL_LOG" | grep -A 2 "EOFException.*78 bytes" | head -20
break
fi
# Now find the chunk info for THIS SPECIFIC FILE
# The file is being READ when the error occurs, so look for SeaweedInputStream opening it
echo "Searching logs for when $FAILING_FILE was opened for reading..."
# Find all instances where this file is mentioned and get nearby chunk info
# Strategy: Search for the filename, then look for "chunks {" blocks near it
CHUNK_CONTEXT=$(echo "$FULL_LOG" | grep -A 100 "new path:.*$FAILING_FILE")
echo "=== EOF ERROR DETECTED! ==="
if [ -n "$CHUNK_CONTEXT" ]; then
echo "Found read context for file"
CHUNK_ID=$(echo "$CHUNK_CONTEXT" | head -30 | grep ' file_id: "' | head -1 | grep -oP '"\K[^"]+')
else
echo "No read context, trying write context..."
# Maybe it's in the write logs
CHUNK_CONTEXT=$(echo "$FULL_LOG" | grep -B 50 -A 20 "$FAILING_FILE")
CHUNK_ID=$(echo "$CHUNK_CONTEXT" | grep ' file_id: "' | head -1 | grep -oP '"\K[^"]+')
fi
echo "Found chunk ID: $CHUNK_ID"
if [ -n "$CHUNK_ID" ]; then
# Download directly from volume server (data persists even after filer metadata deleted)
echo "Downloading chunk from volume server: http://localhost:8080/$CHUNK_ID"
curl -v -o test.parquet "http://localhost:8080/$CHUNK_ID"
if [ "$DOWNLOADED" = "true" ] && [ -f test.parquet ] && [ -s test.parquet ]; then
echo "File was already downloaded proactively!"
FILE_SIZE=$(stat --format=%s test.parquet 2>/dev/null || stat -f%z test.parquet 2>/dev/null)
echo "File size: $FILE_SIZE bytes"
if [ -f test.parquet ] && [ -s test.parquet ]; then
FILE_SIZE=$(stat --format=%s test.parquet 2>/dev/null || stat -f%z test.parquet 2>/dev/null)
echo "SUCCESS: Downloaded $FILE_SIZE bytes from volume!"
ls -lh test.parquet
# Quick analysis
echo ""
echo "Installing parquet-tools..."
pip3 install -q parquet-tools
echo ""
echo "=== File Header (first 100 bytes) ==="
hexdump -C test.parquet | head -10
echo ""
echo "=== File Footer (last 200 bytes) ==="
tail -c 200 test.parquet | hexdump -C
echo ""
echo "=== Magic bytes check ==="
echo "First 4 bytes (should be PAR1):"
head -c 4 test.parquet | xxd
echo "Last 4 bytes (should be PAR1):"
tail -c 4 test.parquet | xxd
echo ""
echo "=== Parquet metadata ==="
parquet-tools inspect test.parquet || echo "parquet-tools inspect failed"
echo ""
echo "=== Try reading data ==="
parquet-tools show test.parquet || echo "parquet-tools show failed"
echo ""
echo "=== CRITICAL ANALYSIS: Where are the missing 78 bytes? ==="
echo "Actual file size: $FILE_SIZE bytes"
# Parse footer to find what size Parquet thinks the file should be
echo ""
echo "Reading footer length (last 8 bytes)..."
FOOTER_LEN_HEX=$(tail -c 8 test.parquet | head -c 4 | xxd -p)
echo "Footer length (hex): $FOOTER_LEN_HEX"
# Get the highest offset from column metadata
echo ""
echo "Examining column chunk offsets from metadata..."
parquet-tools meta test.parquet > meta.txt 2>&1 || true
cat meta.txt
echo ""
echo "Analyzing offset pattern..."
grep -i "offset" meta.txt || echo "No offset info"
echo ""
echo "Expected file size based on Parquet metadata:"
echo " If Parquet reader expects $((FILE_SIZE + 78)) bytes,"
echo " then column chunks claim offsets beyond actual data"
echo ""
echo "=== Download the file as artifact for local analysis ==="
ls -lh test.parquet
else
echo "FAILED: Could not download chunk"
fi
# Analyze it
echo ""
echo "Installing parquet-tools..."
pip3 install -q parquet-tools
echo ""
echo "=== File Header (first 100 bytes) ==="
hexdump -C test.parquet | head -10
echo ""
echo "=== File Footer (last 200 bytes) ==="
tail -c 200 test.parquet | hexdump -C
echo ""
echo "=== Magic bytes check ==="
echo "First 4 bytes (should be PAR1):"
head -c 4 test.parquet | xxd
echo "Last 4 bytes (should be PAR1):"
tail -c 4 test.parquet | xxd
echo ""
echo "=== Parquet metadata ==="
parquet-tools inspect test.parquet || echo "parquet-tools inspect failed"
echo ""
echo "=== Try reading data ==="
parquet-tools show test.parquet || echo "parquet-tools show failed"
echo ""
echo "=== CRITICAL ANALYSIS: Where are the missing 78 bytes? ==="
echo "Actual file size: $FILE_SIZE bytes"
echo ""
echo "Examining column chunk offsets from metadata..."
parquet-tools meta test.parquet > meta.txt 2>&1 || true
cat meta.txt
echo ""
echo "Analyzing offset pattern..."
grep -i "offset" meta.txt || echo "No offset info"
else
echo "ERROR: Could not extract chunk ID from logs"
echo "ERROR: File was not downloaded proactively!"
fi
break
fi
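
As a local follow-up to the "missing 78 bytes" analysis above, a minimal sketch that decodes the trailer of the downloaded test.parquet (assumptions: GNU coreutils stat/fold/tac and xxd are available, as already used in the workflow):

    # Sketch only: a Parquet file ends with <footer metadata><4-byte little-endian
    # footer length><"PAR1">; compare what the trailer claims with the actual size.
    FILE_SIZE=$(stat --format=%s test.parquet 2>/dev/null || stat -f%z test.parquet 2>/dev/null)
    MAGIC=$(tail -c 4 test.parquet)
    FOOTER_LEN_LE=$(tail -c 8 test.parquet | head -c 4 | xxd -p)
    # Reverse the byte order to read the little-endian length as a decimal value.
    FOOTER_LEN=$((16#$(echo "$FOOTER_LEN_LE" | fold -w2 | tac | tr -d '\n')))
    echo "Actual size: $FILE_SIZE bytes, trailing magic: $MAGIC, footer length: $FOOTER_LEN bytes"
    # If the reader still reports 78 missing bytes, the column chunk offsets in the
    # footer point past the data that actually reached the file.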
