From e76107c22e0098d6c4500aa76d12435c9e0c1f98 Mon Sep 17 00:00:00 2001
From: chrislu <chris.lu@gmail.com>
Date: Sun, 23 Nov 2025 20:03:28 -0800
Subject: [PATCH] fix: extract chunk ID for the EXACT file causing EOF error

CRITICAL FIX: We were downloading the wrong file!

The issue:
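- The EOF error is for: test-spark/employees/part-00000-xxx.parquet
- But the logs contain chunk info for MULTIPLE files (employees_window with 1275 bytes, etc.)
- grep -B 50 "EOFException" ... | tail -1 took the last file_id logged before the error, which could belong to one of those OTHER files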

The solution:
1. Extract the EXACT failing filename from EOF error message
2. Search logs for chunk info specifically for THAT file
3. Download the correct chunk

Example:
- EOF error mentions: part-00000-32cafb4f-82c4-436e-a22a-ebf2f5cb541e-c000.snappy.parquet
- Find chunk info for this specific file, not other files in logs

Now we'll download the actual problematic file, not a random one!
---
 .github/workflows/spark-integration-tests.yml | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/spark-integration-tests.yml b/.github/workflows/spark-integration-tests.yml
index 8a2114b80..5d3792388 100644
--- a/.github/workflows/spark-integration-tests.yml
+++ b/.github/workflows/spark-integration-tests.yml
@@ -135,13 +135,16 @@ jobs:
               echo ""
               echo "=== EOF ERROR DETECTED! Extracting chunk data ==="
               
-              # Get the last file that was read before the error
-              # Look for the entry with chunks info right before the error
+              # Capture the full spark-tests log once; the failing file name and its chunk ID are both extracted from it
               FULL_LOG=$(docker compose logs spark-tests 2>&1)
               
-              # Extract chunk file_id (format: "7,d0364fd01")
-              # Look for the line "file_id: " but NOT "source_file_id: "
-              CHUNK_ID=$(echo "$FULL_LOG" | grep -B 50 "EOFException" | grep '  file_id: "' | tail -1 | grep -oP '"\K[^"]+')
+              # Extract the failing parquet file name from the EOFException line (first match; only test-spark/employees/ paths are recognized)
+              FAILING_FILE=$(echo "$FULL_LOG" | grep "EOFException" | grep -oP 'test-spark/employees/\K[^"]+\.parquet' | head -1)
+              echo "Failing file: $FAILING_FILE"
+              CHUNK_ID=""  # stays empty if no file name was found: an empty grep pattern would match every log line and pick an unrelated chunk
+              if [ -n "$FAILING_FILE" ]; then  # -F matches the name literally (dots are not wildcards); -B 200 keeps the lines logged before it
+                CHUNK_ID=$(echo "$FULL_LOG" | grep -F -B 200 -- "$FAILING_FILE" | grep 'chunks {' -A 10 | grep '  file_id: "' | tail -1 | grep -oP '"\K[^"]+')
+              fi
               echo "Found chunk ID: $CHUNK_ID"
               
               if [ -n "$CHUNK_ID" ]; then