From 16b8cf3e52dc730154ace86ee206eac3b787326f Mon Sep 17 00:00:00 2001 From: chrislu Date: Mon, 24 Nov 2025 00:19:25 -0800 Subject: [PATCH] debug: add logging to EOF return path - FOUND ROOT CAUSE! Added logging to the early return path in SeaweedInputStream.read() that returns -1 when position >= contentLength. KEY FINDING: Parquet is trying to read 78 bytes from position 1275, but the file ends at 1275! This proves the Parquet footer metadata has INCORRECT offsets or sizes, making it think there's data at bytes [1275-1353) which don't exist. Since getPos() returned correct values during write (383, 1267), the issue is likely: 1. Parquet 1.16.0 has different footer format/calculation 2. There's a mismatch between write-time and read-time offset calculations 3. Column chunk sizes in footer are off by 78 bytes Next: Investigate if downgrading Parquet or fixing footer size calculations resolves the issue. --- .../seaweedfs/client/SeaweedInputStream.java | 2 ++ .../seaweedfs/client/SeaweedOutputStream.java | 16 +++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/other/java/client/src/main/java/seaweedfs/client/SeaweedInputStream.java b/other/java/client/src/main/java/seaweedfs/client/SeaweedInputStream.java index 0e016ad6e..3ebe5a185 100644 --- a/other/java/client/src/main/java/seaweedfs/client/SeaweedInputStream.java +++ b/other/java/client/src/main/java/seaweedfs/client/SeaweedInputStream.java @@ -119,6 +119,8 @@ public class SeaweedInputStream extends InputStream { throw new IllegalArgumentException("attempting to read from negative offset"); } if (position >= contentLength) { + LOG.warn("[DEBUG-2024] SeaweedInputStream.read() returning EOF: path={} position={} contentLength={} bufRemaining={}", + path, position, contentLength, buf.remaining()); return -1; // Hadoop prefers -1 to EOFException } diff --git a/other/java/client/src/main/java/seaweedfs/client/SeaweedOutputStream.java b/other/java/client/src/main/java/seaweedfs/client/SeaweedOutputStream.java index cf7f74d9c..e44f59b7e 100644 --- a/other/java/client/src/main/java/seaweedfs/client/SeaweedOutputStream.java +++ b/other/java/client/src/main/java/seaweedfs/client/SeaweedOutputStream.java @@ -107,12 +107,13 @@ public class SeaweedOutputStream extends OutputStream { String caller = "unknown"; if (stackTrace.length > 2) { StackTraceElement callerElement = stackTrace[2]; - caller = callerElement.getClassName() + "." + callerElement.getMethodName() + ":" + callerElement.getLineNumber(); + caller = callerElement.getClassName() + "." + callerElement.getMethodName() + ":" + + callerElement.getLineNumber(); } - + LOG.warn( "[DEBUG-2024] getPos() called by {}: flushedPosition={} bufferPosition={} returning={} totalBytesWritten={} writeCalls={} path={}", - caller, position, buffer.position(), currentPos, totalBytesWritten, writeCallCount, + caller, position, buffer.position(), currentPos, totalBytesWritten, writeCallCount, path.substring(Math.max(0, path.length() - 80))); // Last 80 chars of path } return currentPos; @@ -204,7 +205,7 @@ public class SeaweedOutputStream extends OutputStream { int currentOffset = off; int writableBytes = bufferSize - buffer.position(); int numberOfBytesToWrite = length; - + // Track position before write long posBeforeWrite = position + buffer.position(); @@ -219,12 +220,13 @@ public class SeaweedOutputStream extends OutputStream { // ((outputIndex + currentOffset) + writableBytes) + ") " + buffer.capacity()); buffer.put(data, currentOffset, writableBytes); currentOffset += writableBytes; - + if (path.contains("parquet")) { - LOG.warn("[DEBUG-2024] Buffer FLUSH: posBeforeFlush={} flushingBufferSize={} newPositionAfterFlush={} totalWritten={}", + LOG.warn( + "[DEBUG-2024] Buffer FLUSH: posBeforeFlush={} flushingBufferSize={} newPositionAfterFlush={} totalWritten={}", posBeforeWrite, bufferSize, position + bufferSize, totalBytesWritten); } - + writeCurrentBufferToService(); numberOfBytesToWrite = numberOfBytesToWrite - writableBytes; writableBytes = bufferSize - buffer.position();