From b767825ba00b2ad39d6937123d7c4b61a1aa594b Mon Sep 17 00:00:00 2001 From: chrislu Date: Sun, 23 Nov 2025 13:42:39 -0800 Subject: [PATCH] test: add Parquet file download and inspection on failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added diagnostic step to download and examine actual Parquet files when tests fail. This will definitively answer: 1. Is the file complete? (Check PAR1 magic bytes at start/end) 2. What size is it? (Compare actual vs expected) 3. Can parquet-tools read it? (Reader compatibility test) 4. What does the footer contain? (Hex dump last 200 bytes) Steps performed: - List files in SeaweedFS - Download first Parquet file - Check magic bytes (PAR1 at offset 0 and EOF-4) - Show file size from filesystem - Hex dump header (first 100 bytes) - Hex dump footer (last 200 bytes) - Run parquet-tools inspect/show - Upload file as artifact for local analysis This will reveal if the issue is: A) File is incomplete (missing trailer) → SeaweedFS write problem B) File is complete but unreadable → Parquet format problem C) File is complete and readable → SeaweedFS read problem D) File size doesn't match metadata → Footer offset problem The downloaded file will be available as 'failed-parquet-file' artifact. 
---
 .github/workflows/spark-integration-tests.yml | 74 +++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/.github/workflows/spark-integration-tests.yml b/.github/workflows/spark-integration-tests.yml
index 9d2524d33..509de0a10 100644
--- a/.github/workflows/spark-integration-tests.yml
+++ b/.github/workflows/spark-integration-tests.yml
@@ -125,6 +125,71 @@ jobs:
           docker compose up --abort-on-container-exit --exit-code-from spark-tests spark-tests
           echo "✓ Tests completed"
 
+      - name: Download and examine Parquet files
+        if: failure()
+        working-directory: test/java/spark
+        run: |
+          echo "=== Downloading Parquet files for analysis ==="
+
+          # Install parquet-tools (provides `inspect` / `show` subcommands)
+          pip3 install parquet-tools
+
+          # List available files (URL quoted: `?` would otherwise be glob-expanded by the shell)
+          echo "Available Parquet files:"
+          curl -s "http://localhost:8888/test-spark/employees/?pretty=y" | tee files.json
+
+          # Download the first Parquet file found in the filer listing
+          PARQUET_FILE=$(curl -s "http://localhost:8888/test-spark/employees/?pretty=y" | grep -o 'part-[^"]*\.parquet' | head -1)
+          if [ -n "$PARQUET_FILE" ]; then
+            echo "Downloading: $PARQUET_FILE"
+            curl -o test.parquet "http://localhost:8888/test-spark/employees/$PARQUET_FILE"
+
+            echo ""
+            echo "=== File Size ==="
+            ls -lh test.parquet
+            FILE_SIZE=$(stat -c%s test.parquet 2>/dev/null || stat -f%z test.parquet)  # GNU stat first: `stat -f%z` prints `?` and exits 0 on Linux, masking the fallback
+            echo "Actual file size: $FILE_SIZE bytes"
+
+            echo ""
+            echo "=== File Header (first 100 bytes) ==="
+            head -c 100 test.parquet | hexdump -C  # was `hexdump | head -10`, which showed 160 bytes, not 100
+
+            echo ""
+            echo "=== File Footer (last 200 bytes) ==="
+            tail -c 200 test.parquet | hexdump -C
+
+            echo ""
+            echo "=== Magic Bytes Check ==="
+            echo "First 4 bytes (should be PAR1):"
+            head -c 4 test.parquet | xxd
+            echo "Last 4 bytes (should be PAR1):"
+            tail -c 4 test.parquet | xxd
+
+            echo ""
+            echo "=== Parquet Metadata ==="
+            parquet-tools inspect test.parquet || echo "parquet-tools failed"
+
+            echo ""
+            echo "=== Try Reading with Parquet Tools ==="
+            parquet-tools show test.parquet || echo "Failed to read file"
+
+            echo ""
+            echo "=== File appears to be: ==="
+            if head -c 4 test.parquet | grep -q "PAR1"; then
+              echo "✓ Valid Parquet header"
+            else
+              echo "✗ INVALID Parquet header"
+            fi
+
+            if tail -c 4 test.parquet | grep -q "PAR1"; then
+              echo "✓ Valid Parquet trailer"
+            else
+              echo "✗ INVALID Parquet trailer"
+            fi
+          else
+            echo "No Parquet files found"
+          fi
+
       - name: Stop test services
         if: always()
         working-directory: test/java/spark
@@ -138,6 +203,15 @@ jobs:
           path: test/java/spark/target/surefire-reports/
           retention-days: 30
 
+      - name: Upload Parquet file for analysis
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: failed-parquet-file
+          path: test/java/spark/test.parquet
+          retention-days: 7
+          if-no-files-found: ignore
+
       - name: Publish test report
         if: always()
         uses: dorny/test-reporter@v1