From b767825ba00b2ad39d6937123d7c4b61a1aa594b Mon Sep 17 00:00:00 2001 From: chrislu Date: Sun, 23 Nov 2025 13:42:39 -0800 Subject: [PATCH] test: add Parquet file download and inspection on failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added diagnostic step to download and examine actual Parquet files when tests fail. This will definitively answer: 1. Is the file complete? (Check PAR1 magic bytes at start/end) 2. What size is it? (Compare actual vs expected) 3. Can parquet-tools read it? (Reader compatibility test) 4. What does the footer contain? (Hex dump last 200 bytes) Steps performed: - List files in SeaweedFS - Download first Parquet file - Check magic bytes (PAR1 at offset 0 and EOF-4) - Show file size from filesystem - Hex dump header (first 100 bytes) - Hex dump footer (last 200 bytes) - Run parquet-tools inspect/show - Upload file as artifact for local analysis This will reveal if the issue is: A) File is incomplete (missing trailer) → SeaweedFS write problem B) File is complete but unreadable → Parquet format problem C) File is complete and readable → SeaweedFS read problem D) File size doesn't match metadata → Footer offset problem The downloaded file will be available as 'failed-parquet-file' artifact. 
---
 .github/workflows/spark-integration-tests.yml | 74 +++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/.github/workflows/spark-integration-tests.yml b/.github/workflows/spark-integration-tests.yml
index 9d2524d33..509de0a10 100644
--- a/.github/workflows/spark-integration-tests.yml
+++ b/.github/workflows/spark-integration-tests.yml
@@ -125,6 +125,71 @@ jobs:
           docker compose up --abort-on-container-exit --exit-code-from spark-tests spark-tests
           echo "✓ Tests completed"
 
+      - name: Download and examine Parquet files
+        if: failure()
+        working-directory: test/java/spark
+        run: |
+          echo "=== Downloading Parquet files for analysis ==="
+
+          # Install parquet-tools (provides `inspect` / `show` subcommands)
+          pip3 install parquet-tools
+
+          # List available files (URL quoted: `?` would otherwise be glob-expanded by the shell)
+          echo "Available Parquet files:"
+          curl -s "http://localhost:8888/test-spark/employees/?pretty=y" | tee files.json
+
+          # Download the first Parquet file found in the filer listing
+          PARQUET_FILE=$(curl -s "http://localhost:8888/test-spark/employees/?pretty=y" | grep -o 'part-[^"]*\.parquet' | head -1)
+          if [ -n "$PARQUET_FILE" ]; then
+            echo "Downloading: $PARQUET_FILE"
+            curl -o test.parquet "http://localhost:8888/test-spark/employees/$PARQUET_FILE"
+
+            echo ""
+            echo "=== File Size ==="
+            ls -lh test.parquet
+            FILE_SIZE=$(stat -c%s test.parquet 2>/dev/null || stat -f%z test.parquet)  # GNU stat first: `stat -f%z` prints `?` and exits 0 on Linux, masking the fallback
+            echo "Actual file size: $FILE_SIZE bytes"
+
+            echo ""
+            echo "=== File Header (first 100 bytes) ==="
+            head -c 100 test.parquet | hexdump -C  # was `hexdump | head -10`, which showed 160 bytes, not 100
+
+            echo ""
+            echo "=== File Footer (last 200 bytes) ==="
+            tail -c 200 test.parquet | hexdump -C
+
+            echo ""
+            echo "=== Magic Bytes Check ==="
+            echo "First 4 bytes (should be PAR1):"
+            head -c 4 test.parquet | xxd
+            echo "Last 4 bytes (should be PAR1):"
+            tail -c 4 test.parquet | xxd
+
+            echo ""
+            echo "=== Parquet Metadata ==="
+            parquet-tools inspect test.parquet || echo "parquet-tools failed"
+
+            echo ""
+            echo "=== Try Reading with Parquet Tools ==="
+            parquet-tools show test.parquet || echo "Failed to read file"
+
+            echo ""
+            echo "=== File appears to be: ==="
+            if head -c 4 test.parquet | grep -q "PAR1"; then
+              echo "✓ Valid Parquet header"
+            else
+              echo "✗ INVALID Parquet header"
+            fi
+
+            if tail -c 4 test.parquet | grep -q "PAR1"; then
+              echo "✓ Valid Parquet trailer"
+            else
+              echo "✗ INVALID Parquet trailer"
+            fi
+          else
+            echo "No Parquet files found"
+          fi
+
       - name: Stop test services
         if: always()
         working-directory: test/java/spark
@@ -138,6 +203,15 @@ jobs:
           path: test/java/spark/target/surefire-reports/
           retention-days: 30
 
+      - name: Upload Parquet file for analysis
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: failed-parquet-file
+          path: test/java/spark/test.parquet
+          retention-days: 7
+          if-no-files-found: ignore
+
       - name: Publish test report
         if: always()
         uses: dorny/test-reporter@v1