
test: add Parquet file download and inspection on failure

Added a diagnostic step that downloads and examines the actual Parquet
files when tests fail. This will definitively answer:

1. Is the file complete? (Check PAR1 magic bytes at start/end)
2. What size is it? (Compare actual vs expected)
3. Can parquet-tools read it? (Reader compatibility test)
4. What does the footer contain? (Hex dump last 200 bytes)
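The magic-byte check in point 1 can be sketched as a small shell helper. This is an illustrative sketch only: `check_magic` and the synthetic `demo.parquet` file are hypothetical and not part of the workflow; a valid Parquet file starts and ends with the 4-byte ASCII magic `PAR1`.

```shell
#!/bin/sh
# Sketch: verify the 4-byte PAR1 magic at both ends of a Parquet file.
# check_magic is an illustrative helper, not part of the workflow itself.
check_magic() {
  f="$1"
  if [ "$(head -c 4 "$f")" = "PAR1" ]; then
    echo "header: PAR1 ok"
  else
    echo "header: INVALID"
  fi
  if [ "$(tail -c 4 "$f")" = "PAR1" ]; then
    echo "trailer: PAR1 ok"
  else
    echo "trailer: INVALID"
  fi
}

# Demo on a synthetic file that only mimics the magic bytes.
printf 'PAR1fake-row-group-dataPAR1' > demo.parquet
check_magic demo.parquet
```

A file that fails the trailer check but passes the header check was most likely truncated mid-write, which is exactly the distinction the diagnostic step is after.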

Steps performed:
- List files in SeaweedFS
- Download first Parquet file
- Check magic bytes (PAR1 at offset 0 and EOF-4)
- Show file size from filesystem
- Hex dump header (first 100 bytes)
- Hex dump footer (last 200 bytes)
- Run parquet-tools inspect/show
- Upload file as artifact for local analysis

This will reveal whether the issue is:
A) File is incomplete (missing trailer) → SeaweedFS write problem
B) File is complete but unreadable → Parquet format problem
C) File is complete and readable → SeaweedFS read problem
D) File size doesn't match metadata → Footer offset problem
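Case D can be probed directly: in the Parquet format, the last 8 bytes of a file are a 4-byte little-endian footer (metadata) length followed by the `PAR1` magic, so that length plus 12 bytes (4-byte header magic + 4-byte length + 4-byte trailer magic) must fit within the file. A minimal sketch, assuming a little-endian host (true of GitHub's x86_64 runners, which `od` relies on here); `footer_check` and the demo file are hypothetical:

```shell
#!/bin/sh
# Sketch of check D: the 4-byte little-endian footer length stored at EOF-8
# must satisfy len + 12 <= file size. footer_check is an illustrative helper;
# od -tu4 assumes a little-endian host.
footer_check() {
  f="$1"
  size=$(stat -c%s "$f" 2>/dev/null || stat -f%z "$f")
  len=$(tail -c 8 "$f" | head -c 4 | od -An -tu4 -v | tr -d ' ')
  echo "file size: $size, footer metadata length: $len"
  if [ $((len + 12)) -le "$size" ]; then
    echo "footer offset consistent"
  else
    echo "footer length exceeds file size (truncated or corrupt)"
  fi
}

# Demo: minimal byte layout PAR1 + 4-byte fake metadata + length(4 LE) + PAR1.
printf 'PAR1META\004\000\000\000PAR1' > demo.parquet
footer_check demo.parquet
```

If the recorded length points past the end of the file, the footer offset is wrong or the tail of the file never made it to storage, which separates case D from cases A-C.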

The downloaded file will be available as 'failed-parquet-file' artifact.
Branch: pull/7526/head
Author: chrislu, 1 week ago
Commit: b767825ba0
1 changed file: .github/workflows/spark-integration-tests.yml (+74)

@@ -125,6 +125,71 @@ jobs:
           docker compose up --abort-on-container-exit --exit-code-from spark-tests spark-tests
           echo "✓ Tests completed"
+      - name: Download and examine Parquet files
+        if: failure()
+        working-directory: test/java/spark
+        run: |
+          echo "=== Downloading Parquet files for analysis ==="
+
+          # Install parquet-tools
+          pip3 install parquet-tools
+
+          # List available files
+          echo "Available Parquet files:"
+          curl -s http://localhost:8888/test-spark/employees/?pretty=y | tee files.json
+
+          # Download a Parquet file
+          PARQUET_FILE=$(curl -s http://localhost:8888/test-spark/employees/?pretty=y | grep -o 'part-[^"]*\.parquet' | head -1)
+
+          if [ -n "$PARQUET_FILE" ]; then
+            echo "Downloading: $PARQUET_FILE"
+            curl -o test.parquet "http://localhost:8888/test-spark/employees/$PARQUET_FILE"
+
+            echo ""
+            echo "=== File Size ==="
+            ls -lh test.parquet
+            FILE_SIZE=$(stat -f%z test.parquet 2>/dev/null || stat -c%s test.parquet)
+            echo "Actual file size: $FILE_SIZE bytes"
+
+            echo ""
+            echo "=== File Header (first 100 bytes) ==="
+            hexdump -C test.parquet | head -10
+
+            echo ""
+            echo "=== File Footer (last 200 bytes) ==="
+            tail -c 200 test.parquet | hexdump -C
+
+            echo ""
+            echo "=== Magic Bytes Check ==="
+            echo "First 4 bytes (should be PAR1):"
+            head -c 4 test.parquet | xxd
+            echo "Last 4 bytes (should be PAR1):"
+            tail -c 4 test.parquet | xxd
+
+            echo ""
+            echo "=== Parquet Metadata ==="
+            parquet-tools inspect test.parquet || echo "parquet-tools failed"
+
+            echo ""
+            echo "=== Try Reading with Parquet Tools ==="
+            parquet-tools show test.parquet || echo "Failed to read file"
+
+            echo ""
+            echo "=== File appears to be: ==="
+            if head -c 4 test.parquet | grep -q "PAR1"; then
+              echo "✓ Valid Parquet header"
+            else
+              echo "✗ INVALID Parquet header"
+            fi
+            if tail -c 4 test.parquet | grep -q "PAR1"; then
+              echo "✓ Valid Parquet trailer"
+            else
+              echo "✗ INVALID Parquet trailer"
+            fi
+          else
+            echo "No Parquet files found"
+          fi
       - name: Stop test services
         if: always()
         working-directory: test/java/spark
@@ -138,6 +203,15 @@ jobs:
          path: test/java/spark/target/surefire-reports/
          retention-days: 30
+      - name: Upload Parquet file for analysis
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: failed-parquet-file
+          path: test/java/spark/test.parquet
+          retention-days: 7
+          if-no-files-found: ignore
       - name: Publish test report
         if: always()
         uses: dorny/test-reporter@v1
