From 7b067d2e596fcfbe7b7ae38ca3ae020286548b27 Mon Sep 17 00:00:00 2001 From: chrislu Date: Sun, 23 Nov 2025 12:08:23 -0800 Subject: [PATCH] debug: add getPos() method to track position queries Added getPos() to SeaweedOutputStream to understand when and how Hadoop/Parquet queries the output stream position. Current mystery: - Files are written correctly (totalBytesWritten=position=chunks) - But Parquet expects 78 more bytes when reading - year=2020: wrote 696, expects 774 (missing 78) - year=2021: wrote 684, expects 762 (missing 78) The consistent 78-byte discrepancy suggests either: A) Parquet calculates row group size before finalizing footer B) FSDataOutputStream tracks position differently than our stream C) Footer is written with stale/incorrect metadata D) File size is cached/stale during rename operation getPos() logging will show if Parquet/Hadoop queries position and what value is returned vs what was actually written. --- .../seaweedfs/client/SeaweedOutputStream.java | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/other/java/client/src/main/java/seaweedfs/client/SeaweedOutputStream.java b/other/java/client/src/main/java/seaweedfs/client/SeaweedOutputStream.java index a30c52834..b0d79b26c 100644 --- a/other/java/client/src/main/java/seaweedfs/client/SeaweedOutputStream.java +++ b/other/java/client/src/main/java/seaweedfs/client/SeaweedOutputStream.java @@ -34,7 +34,7 @@ public class SeaweedOutputStream extends OutputStream { private long outputIndex; private String replication = ""; private String collection = ""; - private long totalBytesWritten = 0; // Track total bytes for debugging + private long totalBytesWritten = 0; // Track total bytes for debugging public SeaweedOutputStream(FilerClient filerClient, final String fullpath) { this(filerClient, fullpath, ""); @@ -90,6 +90,19 @@ public class SeaweedOutputStream extends OutputStream { this.collection = collection; } + /** + * Get the current position in the output stream. + * This returns the total position including both flushed and buffered data. + * + * @return current position (flushed + buffered bytes) + */ + public synchronized long getPos() { + long currentPos = position + buffer.position(); + LOG.info("[DEBUG-2024] 📍 getPos() called: flushedPosition={} bufferPosition={} returning={} path={}", + position, buffer.position(), currentPos, path); + return currentPos; + } + public static String getParentDirectory(String path) { int protoIndex = path.indexOf("://"); if (protoIndex >= 0) { @@ -207,7 +220,7 @@ public class SeaweedOutputStream extends OutputStream { } int bufferPosBeforeFlush = buffer.position(); - LOG.info("[DEBUG-2024] 🔒 close START: path={} position={} buffer.position()={} totalBytesWritten={}", + LOG.info("[DEBUG-2024] 🔒 close START: path={} position={} buffer.position()={} totalBytesWritten={}", path, position, bufferPosBeforeFlush, totalBytesWritten); try { flushInternal();