From e76107c22e0098d6c4500aa76d12435c9e0c1f98 Mon Sep 17 00:00:00 2001 From: chrislu Date: Sun, 23 Nov 2025 20:03:28 -0800 Subject: [PATCH] fix: extract chunk ID for the EXACT file causing EOF error CRITICAL FIX: We were downloading the wrong file! The issue: - EOF error is for: test-spark/employees/part-00000-xxx.parquet - But logs contain MULTIPLE files (employees_window with 1275 bytes, etc.) - grep -B 50 was matching chunk info from OTHER files The solution: 1. Extract the EXACT failing filename from EOF error message 2. Search logs for chunk info specifically for THAT file 3. Download the correct chunk Example: - EOF error mentions: part-00000-32cafb4f-82c4-436e-a22a-ebf2f5cb541e-c000.snappy.parquet - Find chunk info for this specific file, not other files in logs Now we'll download the actual problematic file, not a random one! --- .github/workflows/spark-integration-tests.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/spark-integration-tests.yml b/.github/workflows/spark-integration-tests.yml index 8a2114b80..5d3792388 100644 --- a/.github/workflows/spark-integration-tests.yml +++ b/.github/workflows/spark-integration-tests.yml @@ -135,13 +135,16 @@ jobs: echo "" echo "=== EOF ERROR DETECTED! 
Extracting chunk data ==="
- # Get the last file that was read before the error
- # Look for the entry with chunks info right before the error
+ # Get the full log and extract the EXACT file causing the error
  FULL_LOG=$(docker compose logs spark-tests 2>&1)

- # Extract chunk file_id (format: "7,d0364fd01")
- # Look for the line "file_id: " but NOT "source_file_id: "
- CHUNK_ID=$(echo "$FULL_LOG" | grep -B 50 "EOFException" | grep ' file_id: "' | tail -1 | grep -oP '"\K[^"]+')
+ # Extract the failing filename (the part after test-spark/employees/) from the first EOFException line that carries one
+ FAILING_FILE=$(echo "$FULL_LOG" | grep "EOFException" | grep -oP 'test-spark/employees/\K[^"]+\.parquet' | head -1)
+ echo "Failing file: $FAILING_FILE"
+
+ # Find the chunk info for THIS SPECIFIC FILE. Skip the lookup when no filename was extracted: an empty grep pattern matches every log line and would select a chunk belonging to an unrelated file.
+ # grep -F matches the filename literally (its dots are not regex wildcards). From the 200 lines of context before each mention, keep the last ' file_id: "' line found within 10 lines after a 'chunks {' line.
+ CHUNK_ID=$(if [ -n "$FAILING_FILE" ]; then echo "$FULL_LOG" | grep -F -B 200 -- "$FAILING_FILE" | grep 'chunks {' -A 10 | grep ' file_id: "' | tail -1 | grep -oP '"\K[^"]+'; fi)
  echo "Found chunk ID: $CHUNK_ID"

  if [ -n "$CHUNK_ID" ]; then