From c10ae054b60150d48d4970e444e60fc5f0c76b74 Mon Sep 17 00:00:00 2001 From: chrislu Date: Sun, 23 Nov 2025 23:11:40 -0800 Subject: [PATCH] debug: add logging to SeaweedInputStream constructor to track contentLength MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CRITICAL FINDING: File is PERFECT but Spark fails to read it! The downloaded Parquet file (1275 bytes): - ✅ Valid header/trailer (PAR1) - ✅ Complete metadata - ✅ parquet-tools reads it successfully (all 4 rows) - ❌ Spark gets 'Still have: 78 bytes left' EOF error This proves the bug is in READING, not writing! Hypothesis: SeaweedInputStream.contentLength is set to 1197 (1275-78) instead of 1275 when opening the file for reading. Adding WARN logs to track: - When SeaweedInputStream is created - What contentLength is calculated as - How many chunks the entry has This will show if the metadata is being read incorrectly when Spark opens the file, causing contentLength to be 78 bytes short. --- .../src/main/java/seaweedfs/client/SeaweedInputStream.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/other/java/client/src/main/java/seaweedfs/client/SeaweedInputStream.java b/other/java/client/src/main/java/seaweedfs/client/SeaweedInputStream.java index 64754321b..cdc16a9bf 100644 --- a/other/java/client/src/main/java/seaweedfs/client/SeaweedInputStream.java +++ b/other/java/client/src/main/java/seaweedfs/client/SeaweedInputStream.java @@ -44,6 +44,8 @@ public class SeaweedInputStream extends InputStream { } this.contentLength = SeaweedRead.fileSize(entry); + LOG.warn("[DEBUG-2024] SeaweedInputStream created (from fullpath): path={} contentLength={} #chunks={}", + fullpath, this.contentLength, entry.getChunksCount()); this.visibleIntervalList = SeaweedRead.nonOverlappingVisibleIntervals(filerClient, entry.getChunksList()); @@ -64,6 +66,8 @@ public class SeaweedInputStream extends InputStream { } this.contentLength = SeaweedRead.fileSize(entry); + LOG.warn("[DEBUG-2024] SeaweedInputStream created (from entry): path={} contentLength={} #chunks={}", + path, this.contentLength, entry.getChunksCount()); this.visibleIntervalList = SeaweedRead.nonOverlappingVisibleIntervals(filerClient, entry.getChunksList());