From 4ec6fbcdc7804ab466407bac398b47713615e0dc Mon Sep 17 00:00:00 2001
From: chrislu
Date: Sun, 23 Nov 2025 19:30:55 -0800
Subject: [PATCH] fix: download Parquet data directly from volume server

BREAKTHROUGH: Download chunk data directly from volume server, bypassing
the filer!

The issue: even real-time monitoring is too slow - Spark deletes the filer
metadata instantly after the EOF error.

THE SOLUTION: extract the chunk ID from the logs and download it directly
from the volume server. The volume keeps the data even after the filer
metadata is deleted!

From the logs we see:

  file_id: "7,d0364fd01"
  size: 693

We can download this directly:

  curl http://localhost:8080/7,d0364fd01

Changes:
1. Extract chunk file_id from logs (format: "volume,filekey")
2. Download directly from volume server port 8080
3. Volume data persists longer than filer metadata
4. Comprehensive analysis with parquet-tools, hexdump, magic bytes

This WILL capture the actual file data!
---
 .github/workflows/spark-integration-tests.yml | 53 ++++++++++++++-----
 1 file changed, 39 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/spark-integration-tests.yml b/.github/workflows/spark-integration-tests.yml
index 371ede865..b7f83bbad 100644
--- a/.github/workflows/spark-integration-tests.yml
+++ b/.github/workflows/spark-integration-tests.yml
@@ -127,39 +127,64 @@ jobs:
           echo "Real-time monitoring: Will download file the instant EOF error appears..."
-          # Monitor logs and download file AS SOON AS EOF error appears
+          # Monitor logs and download chunk data DIRECTLY from volume server
           (
             while docker ps | grep -q seaweedfs-spark-tests; do
               # Check if EOF error has appeared
               if docker compose logs spark-tests 2>&1 | grep -q "EOFException.*Still have: 78 bytes left"; then
                 echo ""
-                echo "=== EOF ERROR DETECTED! Downloading file NOW ==="
+                echo "=== EOF ERROR DETECTED! Extracting chunk data ==="
 
-                # Extract filename from error message
-                PARQUET_FILE=$(docker compose logs spark-tests 2>&1 | grep -oP '/test-spark/employees/\K[^"]+\.parquet' | tail -1)
-                echo "Found file: $PARQUET_FILE"
+                # Get the last file that was read before the error
+                # Look for the entry with chunks info right before the error
+                FULL_LOG=$(docker compose logs spark-tests 2>&1)
 
-                if [ -n "$PARQUET_FILE" ]; then
-                  echo "Downloading from http://localhost:8888/test-spark/employees/$PARQUET_FILE"
-                  curl -o test.parquet "http://localhost:8888/test-spark/employees/$PARQUET_FILE"
+                # Extract chunk file_id (format: "7,d0364fd01")
+                CHUNK_ID=$(echo "$FULL_LOG" | grep -B 50 "EOFException" | grep 'file_id:' | tail -1 | grep -oP '"\K[^"]+')
+                echo "Found chunk ID: $CHUNK_ID"
+
+                if [ -n "$CHUNK_ID" ]; then
+                  # Download directly from volume server (data persists even after filer metadata deleted)
+                  echo "Downloading chunk from volume server: http://localhost:8080/$CHUNK_ID"
+                  curl -v -o test.parquet "http://localhost:8080/$CHUNK_ID"
 
                   if [ -f test.parquet ] && [ -s test.parquet ]; then
                     FILE_SIZE=$(stat --format=%s test.parquet 2>/dev/null || stat -f%z test.parquet 2>/dev/null)
-                    echo "SUCCESS: Downloaded $FILE_SIZE bytes!"
+                    echo "SUCCESS: Downloaded $FILE_SIZE bytes from volume!"
                     ls -lh test.parquet
                     # Quick analysis
+                    echo ""
                     echo "Installing parquet-tools..."
                     pip3 install -q parquet-tools
-                    echo "=== Header (first 100 bytes) ==="
+
+                    echo ""
+                    echo "=== File Header (first 100 bytes) ==="
                     hexdump -C test.parquet | head -10
-                    echo "=== Footer (last 100 bytes) ==="
-                    tail -c 100 test.parquet | hexdump -C
+
+                    echo ""
+                    echo "=== File Footer (last 200 bytes) ==="
+                    tail -c 200 test.parquet | hexdump -C
+
+                    echo ""
+                    echo "=== Magic bytes check ==="
+                    echo "First 4 bytes (should be PAR1):"
+                    head -c 4 test.parquet | xxd
+                    echo "Last 4 bytes (should be PAR1):"
+                    tail -c 4 test.parquet | xxd
+
+                    echo ""
                     echo "=== Parquet metadata ==="
-                    parquet-tools inspect test.parquet || echo "Inspect failed"
+                    parquet-tools inspect test.parquet || echo "parquet-tools inspect failed"
+
+                    echo ""
+                    echo "=== Try reading data ==="
+                    parquet-tools show test.parquet || echo "parquet-tools show failed"
                   else
-                    echo "FAILED: Could not download file"
+                    echo "FAILED: Could not download chunk"
                   fi
+                else
+                  echo "ERROR: Could not extract chunk ID from logs"
                 fi
                 break
               fi
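
For reference, below the patch excerpt: a minimal standalone sketch of the same recovery flow for running by hand against a local docker compose setup. The service name (spark-tests), the log format (file_id: "7,d0364fd01"), and the volume server on port 8080 are taken from the patch above; the output filename, the grep window, and the sanity check are illustrative, not part of the workflow change.

#!/usr/bin/env bash
# Sketch only: recover the Parquet chunk straight from the volume server after
# the filer metadata is gone. Assumes the compose service and ports used in
# the patch above; "recovered.parquet" is a hypothetical output name.
set -euo pipefail

# Grab the test container logs (same source the workflow monitors).
FULL_LOG=$(docker compose logs spark-tests 2>&1)

# file_id entries look like: file_id: "7,d0364fd01"  (volume,filekey)
CHUNK_ID=$(echo "$FULL_LOG" | grep -B 50 "EOFException" | grep 'file_id:' | tail -1 | grep -oP '"\K[^"]+')
[ -n "$CHUNK_ID" ] || { echo "no chunk ID found in logs"; exit 1; }

# The volume server serves the raw chunk by file_id even after the filer
# entry has been deleted.
curl -fsS -o recovered.parquet "http://localhost:8080/$CHUNK_ID"

# Quick sanity check: a valid Parquet file starts and ends with the magic PAR1.
head -c 4 recovered.parquet; echo
tail -c 4 recovered.parquet; echo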