From 0afe330b4e1e01a5da989be929fc1b87be96a686 Mon Sep 17 00:00:00 2001
From: chrislu
Date: Sun, 23 Nov 2025 19:58:34 -0800
Subject: [PATCH] feat: add detailed offset analysis for 78-byte discrepancy

SUCCESS: File downloaded and readable! Now analyzing WHY Parquet expects 78 more bytes.

Added analysis:
1. Parse footer length from last 8 bytes
2. Extract column chunk offsets from parquet-tools meta
3. Compare actual file size with expected size from metadata
4. Identify if offsets are pointing beyond actual data

This will reveal:
- Are column chunk offsets incorrectly calculated during write?
- Is the footer claiming data that doesn't exist?
- Where exactly are the missing 78 bytes supposed to be?

The file is already uploaded as artifact for deeper local analysis.
---
 .github/workflows/spark-integration-tests.yml | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/.github/workflows/spark-integration-tests.yml b/.github/workflows/spark-integration-tests.yml
index 55a7b1355..8a2114b80 100644
--- a/.github/workflows/spark-integration-tests.yml
+++ b/.github/workflows/spark-integration-tests.yml
@@ -181,6 +181,35 @@ jobs:
 echo ""
 echo "=== Try reading data ==="
 parquet-tools show test.parquet || echo "parquet-tools show failed"
+
+ echo ""
+ echo "=== CRITICAL ANALYSIS: Where are the missing 78 bytes? ==="
+ echo "Actual file size: $FILE_SIZE bytes"
+
+ # Parse footer to find what size Parquet thinks the file should be
+ echo ""
+ echo "Reading footer length (last 8 bytes)..."
+ FOOTER_LEN_HEX=$(tail -c 8 test.parquet | head -c 4 | xxd -p)
+ echo "Footer length (hex): $FOOTER_LEN_HEX"
+
+ # Get the highest offset from column metadata
+ echo ""
+ echo "Examining column chunk offsets from metadata..."
+ parquet-tools meta test.parquet > meta.txt 2>&1 || true
+ cat meta.txt
+
+ echo ""
+ echo "Analyzing offset pattern..."
+ grep -i "offset" meta.txt || echo "No offset info"
+
+ echo ""
+ echo "Expected file size based on Parquet metadata:"
+ echo " If Parquet reader expects $((FILE_SIZE + 78)) bytes,"
+ echo " then column chunks claim offsets beyond actual data"
+
+ echo ""
+ echo "=== Download the file as artifact for local analysis ==="
+ ls -lh test.parquet
 else
 echo "FAILED: Could not download chunk"
 fi