package log_buffer

import (
	"bytes"
	"math"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"google.golang.org/protobuf/proto"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/mq_pb"
	"github.com/seaweedfs/seaweedfs/weed/util"
)

const BufferSize = 8 * 1024 * 1024
const PreviousBufferCount = 32

type dataToFlush struct {
	startTime time.Time
	stopTime  time.Time
	data      *bytes.Buffer
	minOffset int64
	maxOffset int64
	done      chan struct{} // Signal when flush completes
}

type EachLogEntryFuncType func(logEntry *filer_pb.LogEntry) (isDone bool, err error)
type EachLogEntryWithOffsetFuncType func(logEntry *filer_pb.LogEntry, offset int64) (isDone bool, err error)
type LogFlushFuncType func(logBuffer *LogBuffer, startTime, stopTime time.Time, buf []byte, minOffset, maxOffset int64)
type LogReadFromDiskFuncType func(startPosition MessagePosition, stopTsNs int64, eachLogEntryFn EachLogEntryFuncType) (lastReadPosition MessagePosition, isDone bool, err error)

type LogBuffer struct {
	LastFlushTsNs     int64
	name              string
	prevBuffers       *SealedBuffers
	buf               []byte
	offset            int64 // Last offset in current buffer (endOffset)
	bufferStartOffset int64 // First offset in current buffer
	idx               []int
	pos               int
	startTime         time.Time
	stopTime          time.Time
	lastFlushDataTime time.Time
	sizeBuf           []byte
	flushInterval     time.Duration
	flushFn           LogFlushFuncType
	ReadFromDiskFn    LogReadFromDiskFuncType
	notifyFn          func()
	// Per-subscriber notification channels for instant wake-up
	subscribersMu sync.RWMutex
	subscribers   map[string]chan struct{} // subscriberID -> notification channel
	isStopping    *atomic.Bool
	isAllFlushed  bool
	flushChan     chan *dataToFlush
	LastTsNs      atomic.Int64
	// Offset range tracking for Kafka integration
	minOffset         int64
	maxOffset         int64
	hasOffsets        bool
	lastFlushedOffset atomic.Int64 // Highest offset that has been flushed to disk (-1 = nothing flushed yet)
	lastFlushedTime   atomic.Int64 // Latest timestamp that has been flushed to disk (0 = nothing flushed yet)
	sync.RWMutex
}

func NewLogBuffer(name string, flushInterval time.Duration, flushFn LogFlushFuncType,
	readFromDiskFn LogReadFromDiskFuncType, notifyFn func()) *LogBuffer {
	lb := &LogBuffer{
		name:           name,
		prevBuffers:    newSealedBuffers(PreviousBufferCount),
		buf:            make([]byte, BufferSize),
		sizeBuf:        make([]byte, 4),
		flushInterval:  flushInterval,
		flushFn:        flushFn,
		ReadFromDiskFn: readFromDiskFn,
		notifyFn:       notifyFn,
		subscribers:    make(map[string]chan struct{}),
		flushChan:      make(chan *dataToFlush, 256),
		isStopping:     new(atomic.Bool),
		offset:         0, // Will be initialized from existing data if available
	}
	lb.lastFlushedOffset.Store(-1) // Nothing flushed to disk yet
	go lb.loopFlush()
	go lb.loopInterval()
	return lb
}
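
// Usage sketch (hypothetical caller, not part of this file): the owner of a partition
// would typically wire in its own flush and disk-read callbacks, for example:
//
//	flushFn := func(lb *LogBuffer, startTime, stopTime time.Time, buf []byte, minOffset, maxOffset int64) {
//		// persist buf (and its offset/time range) to the filer or local disk here
//	}
//	readFromDiskFn := func(startPosition MessagePosition, stopTsNs int64, eachLogEntryFn EachLogEntryFuncType) (MessagePosition, bool, error) {
//		// replay already-flushed entries and feed them to eachLogEntryFn here
//		return startPosition, true, nil
//	}
//	lb := NewLogBuffer("topic-partition-0", 2*time.Second, flushFn, readFromDiskFn, nil)
//	defer lb.ShutdownLogBuffer()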

// RegisterSubscriber registers a subscriber for instant notifications when data is written
// Returns a channel that will receive notifications (<1ms latency)
func (logBuffer *LogBuffer) RegisterSubscriber(subscriberID string) chan struct{} {
	logBuffer.subscribersMu.Lock()
	defer logBuffer.subscribersMu.Unlock()
	// Check if already registered
	if existingChan, exists := logBuffer.subscribers[subscriberID]; exists {
		glog.V(2).Infof("Subscriber %s already registered for %s, reusing channel", subscriberID, logBuffer.name)
		return existingChan
	}
	// Create buffered channel (size 1) so notifications never block
	notifyChan := make(chan struct{}, 1)
	logBuffer.subscribers[subscriberID] = notifyChan
	glog.V(1).Infof("Registered subscriber %s for %s (total: %d)", subscriberID, logBuffer.name, len(logBuffer.subscribers))
	return notifyChan
}

// UnregisterSubscriber removes a subscriber and closes its notification channel
func (logBuffer *LogBuffer) UnregisterSubscriber(subscriberID string) {
	logBuffer.subscribersMu.Lock()
	defer logBuffer.subscribersMu.Unlock()
	if ch, exists := logBuffer.subscribers[subscriberID]; exists {
		close(ch)
		delete(logBuffer.subscribers, subscriberID)
		glog.V(1).Infof("Unregistered subscriber %s from %s (remaining: %d)", subscriberID, logBuffer.name, len(logBuffer.subscribers))
	}
}
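
// Usage sketch (hypothetical subscriber loop, not part of this file): a reader registers
// once, blocks on the returned channel instead of polling, and unregisters when done:
//
//	notifyChan := lb.RegisterSubscriber("reader-1")
//	defer lb.UnregisterSubscriber("reader-1")
//	for {
//		select {
//		case _, ok := <-notifyChan:
//			if !ok {
//				return // channel closed by UnregisterSubscriber
//			}
//			// new data was written; read it via ReadFromBuffer
//		case <-time.After(time.Second):
//			// periodic fallback check
//		}
//	}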

// IsOffsetInMemory checks if the given offset is available in the in-memory buffer
// Returns true if:
// 1. Offset is newer than what's been flushed to disk (must be in memory)
// 2. Offset is in current buffer or previous buffers (may be flushed but still in memory)
// Returns false if offset is older than memory buffers (only on disk)
func (logBuffer *LogBuffer) IsOffsetInMemory(offset int64) bool {
	logBuffer.RLock()
	defer logBuffer.RUnlock()
	// Check if we're tracking offsets at all
	if !logBuffer.hasOffsets {
		return false // No offsets tracked yet
	}
	// OPTIMIZATION: If offset is newer than what's been flushed to disk,
	// it MUST be in memory (not written to disk yet)
	lastFlushed := logBuffer.lastFlushedOffset.Load()
	if lastFlushed >= 0 && offset > lastFlushed {
		glog.V(3).Infof("Offset %d is in memory (newer than lastFlushed=%d)", offset, lastFlushed)
		return true
	}
	// Check if offset is in current buffer range AND buffer has data
	// (data can be both on disk AND in memory during flush window)
	if offset >= logBuffer.bufferStartOffset && offset <= logBuffer.offset {
		// CRITICAL: Check if buffer actually has data (pos > 0)
		// After flush, pos=0 but range is still valid - data is on disk, not in memory
		if logBuffer.pos > 0 {
			glog.V(3).Infof("Offset %d is in current buffer [%d-%d] with data", offset, logBuffer.bufferStartOffset, logBuffer.offset)
			return true
		}
		// Buffer is empty (just flushed) - data is on disk
		glog.V(3).Infof("Offset %d in range [%d-%d] but buffer empty (pos=0), data on disk", offset, logBuffer.bufferStartOffset, logBuffer.offset)
		return false
	}
	// Check if offset is in previous buffers AND they have data
	for _, buf := range logBuffer.prevBuffers.buffers {
		if offset >= buf.startOffset && offset <= buf.offset {
			// Check if prevBuffer actually has data
			if buf.size > 0 {
				glog.V(3).Infof("Offset %d is in previous buffer [%d-%d] with data", offset, buf.startOffset, buf.offset)
				return true
			}
			// Buffer is empty (flushed) - data is on disk
			glog.V(3).Infof("Offset %d in prevBuffer [%d-%d] but empty (size=0), data on disk", offset, buf.startOffset, buf.offset)
			return false
		}
	}
	// Offset is older than memory buffers - only available on disk
	glog.V(3).Infof("Offset %d is NOT in memory (bufferStart=%d, lastFlushed=%d)", offset, logBuffer.bufferStartOffset, lastFlushed)
	return false
}
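
// Usage sketch (hypothetical fetch path): a caller can use IsOffsetInMemory to choose
// between the in-memory read path and the disk read path, for example:
//
//	if lb.IsOffsetInMemory(requestedOffset) {
//		// serve from memory via ReadFromBuffer with an offset-based MessagePosition
//	} else {
//		// fall back to lb.ReadFromDiskFn starting at requestedOffset
//	}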

// notifySubscribers sends notifications to all registered subscribers
// Non-blocking: uses select with default to avoid blocking on full channels
func (logBuffer *LogBuffer) notifySubscribers() {
	logBuffer.subscribersMu.RLock()
	defer logBuffer.subscribersMu.RUnlock()
	if len(logBuffer.subscribers) == 0 {
		return // No subscribers, skip notification
	}
	for subscriberID, notifyChan := range logBuffer.subscribers {
		select {
		case notifyChan <- struct{}{}:
			// Notification sent successfully
			glog.V(3).Infof("Notified subscriber %s for %s", subscriberID, logBuffer.name)
		default:
			// Channel full - subscriber hasn't consumed previous notification yet
			// This is OK because one notification is sufficient to wake the subscriber
			glog.V(3).Infof("Subscriber %s notification channel full (OK - already notified)", subscriberID)
		}
	}
}

// InitializeOffsetFromExistingData initializes the offset counter from existing data on disk
// This should be called after LogBuffer creation to ensure offset continuity on restart
func (logBuffer *LogBuffer) InitializeOffsetFromExistingData(getHighestOffsetFn func() (int64, error)) error {
	if getHighestOffsetFn == nil {
		return nil // No initialization function provided
	}
	highestOffset, err := getHighestOffsetFn()
	if err != nil {
		glog.V(0).Infof("Failed to get highest offset for %s: %v, starting from 0", logBuffer.name, err)
		return nil // Continue with offset 0 if we can't read existing data
	}
	if highestOffset >= 0 {
		// Set the next offset to be one after the highest existing offset
		nextOffset := highestOffset + 1
		logBuffer.offset = nextOffset
		// CRITICAL FIX: bufferStartOffset should match offset after initialization
		// This ensures that reads for old offsets (0...highestOffset) will trigger disk reads
		// New data written after this will start at nextOffset
		logBuffer.bufferStartOffset = nextOffset
		// CRITICAL: Track that data [0...highestOffset] is on disk
		logBuffer.lastFlushedOffset.Store(highestOffset)
		// Set lastFlushedTime to current time (we know data up to highestOffset is on disk)
		logBuffer.lastFlushedTime.Store(time.Now().UnixNano())
		glog.V(0).Infof("Initialized LogBuffer %s offset to %d (highest existing: %d), buffer starts at %d, lastFlushedOffset=%d, lastFlushedTime=%v",
			logBuffer.name, nextOffset, highestOffset, nextOffset, highestOffset, time.Now())
	} else {
		logBuffer.bufferStartOffset = 0 // Start from offset 0
		// No data on disk yet
		glog.V(0).Infof("No existing data found for %s, starting from offset 0, lastFlushedOffset=-1, lastFlushedTime=0", logBuffer.name)
	}
	return nil
}
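
// Usage sketch (hypothetical restart path): right after construction, the owner of the
// buffer can restore offset continuity from whatever was previously persisted:
//
//	err := lb.InitializeOffsetFromExistingData(func() (int64, error) {
//		// scan the flushed log files for this partition and return the highest
//		// offset found, or -1 if nothing has been persisted yet (hypothetical helper)
//		return scanHighestPersistedOffset()
//	})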

func (logBuffer *LogBuffer) AddToBuffer(message *mq_pb.DataMessage) {
	logBuffer.AddDataToBuffer(message.Key, message.Value, message.TsNs)
}

// AddLogEntryToBuffer directly adds a LogEntry to the buffer, preserving offset information
func (logBuffer *LogBuffer) AddLogEntryToBuffer(logEntry *filer_pb.LogEntry) {
	logEntryData, _ := proto.Marshal(logEntry)
	var toFlush *dataToFlush
	logBuffer.Lock()
	defer func() {
		logBuffer.Unlock()
		if toFlush != nil {
			logBuffer.flushChan <- toFlush
		}
		if logBuffer.notifyFn != nil {
			logBuffer.notifyFn()
		}
		// Notify all registered subscribers instantly (<1ms latency)
		logBuffer.notifySubscribers()
	}()
	processingTsNs := logEntry.TsNs
	ts := time.Unix(0, processingTsNs)
	// Handle timestamp collision inside lock (rare case)
	if logBuffer.LastTsNs.Load() >= processingTsNs {
		processingTsNs = logBuffer.LastTsNs.Add(1)
		ts = time.Unix(0, processingTsNs)
		// Re-marshal with corrected timestamp
		logEntry.TsNs = processingTsNs
		logEntryData, _ = proto.Marshal(logEntry)
	} else {
		logBuffer.LastTsNs.Store(processingTsNs)
	}
	size := len(logEntryData)
	if logBuffer.pos == 0 {
		logBuffer.startTime = ts
		// Reset offset tracking for new buffer
		logBuffer.hasOffsets = false
	}
	// Track offset ranges for Kafka integration
	// CRITICAL FIX: Use >= 0 to include offset 0 (first message in a topic)
	if logEntry.Offset >= 0 {
		if !logBuffer.hasOffsets {
			logBuffer.minOffset = logEntry.Offset
			logBuffer.maxOffset = logEntry.Offset
			logBuffer.hasOffsets = true
		} else {
			if logEntry.Offset < logBuffer.minOffset {
				logBuffer.minOffset = logEntry.Offset
			}
			if logEntry.Offset > logBuffer.maxOffset {
				logBuffer.maxOffset = logEntry.Offset
			}
		}
	}
	if logBuffer.startTime.Add(logBuffer.flushInterval).Before(ts) || len(logBuffer.buf)-logBuffer.pos < size+4 {
		toFlush = logBuffer.copyToFlush()
		logBuffer.startTime = ts
		if len(logBuffer.buf) < size+4 {
			// Validate size to prevent integer overflow in computation BEFORE allocation
			const maxBufferSize = 1 << 30 // 1 GiB practical limit
			// Ensure 2*size + 4 won't overflow int and stays within practical bounds
			if size < 0 || size > (math.MaxInt-4)/2 || size > (maxBufferSize-4)/2 {
				glog.Errorf("Buffer size out of valid range: %d bytes, skipping", size)
				return
			}
			// Safe to compute now that we've validated size is in valid range
			newSize := 2*size + 4
			logBuffer.buf = make([]byte, newSize)
		}
	}
	logBuffer.stopTime = ts
	logBuffer.idx = append(logBuffer.idx, logBuffer.pos)
	util.Uint32toBytes(logBuffer.sizeBuf, uint32(size))
	copy(logBuffer.buf[logBuffer.pos:logBuffer.pos+4], logBuffer.sizeBuf)
	copy(logBuffer.buf[logBuffer.pos+4:logBuffer.pos+4+size], logEntryData)
	logBuffer.pos += size + 4
	logBuffer.offset++
}

func (logBuffer *LogBuffer) AddDataToBuffer(partitionKey, data []byte, processingTsNs int64) {
	// PERFORMANCE OPTIMIZATION: Pre-process expensive operations OUTSIDE the lock
	var ts time.Time
	if processingTsNs == 0 {
		ts = time.Now()
		processingTsNs = ts.UnixNano()
	} else {
		ts = time.Unix(0, processingTsNs)
	}
	logEntry := &filer_pb.LogEntry{
		TsNs:             processingTsNs, // Will be updated if needed
		PartitionKeyHash: util.HashToInt32(partitionKey),
		Data:             data,
		Key:              partitionKey,
	}
	logEntryData, _ := proto.Marshal(logEntry)
	var toFlush *dataToFlush
	logBuffer.Lock()
	defer func() {
		logBuffer.Unlock()
		if toFlush != nil {
			logBuffer.flushChan <- toFlush
		}
		if logBuffer.notifyFn != nil {
			logBuffer.notifyFn()
		}
		// Notify all registered subscribers instantly (<1ms latency)
		logBuffer.notifySubscribers()
	}()
	// Handle timestamp collision inside lock (rare case)
	if logBuffer.LastTsNs.Load() >= processingTsNs {
		processingTsNs = logBuffer.LastTsNs.Add(1)
		ts = time.Unix(0, processingTsNs)
		// Re-marshal with corrected timestamp
		logEntry.TsNs = processingTsNs
		logEntryData, _ = proto.Marshal(logEntry)
	} else {
		logBuffer.LastTsNs.Store(processingTsNs)
	}
	size := len(logEntryData)
	if logBuffer.pos == 0 {
		logBuffer.startTime = ts
	}
	if logBuffer.startTime.Add(logBuffer.flushInterval).Before(ts) || len(logBuffer.buf)-logBuffer.pos < size+4 {
		// glog.V(0).Infof("%s copyToFlush1 offset:%d count:%d start time %v, ts %v, remaining %d bytes", logBuffer.name, logBuffer.offset, len(logBuffer.idx), logBuffer.startTime, ts, len(logBuffer.buf)-logBuffer.pos)
		toFlush = logBuffer.copyToFlush()
		logBuffer.startTime = ts
		if len(logBuffer.buf) < size+4 {
			// Validate size to prevent integer overflow in computation BEFORE allocation
			const maxBufferSize = 1 << 30 // 1 GiB practical limit
			// Ensure 2*size + 4 won't overflow int and stays within practical bounds
			if size < 0 || size > (math.MaxInt-4)/2 || size > (maxBufferSize-4)/2 {
				glog.Errorf("Buffer size out of valid range: %d bytes, skipping", size)
				return
			}
			// Safe to compute now that we've validated size is in valid range
			newSize := 2*size + 4
			logBuffer.buf = make([]byte, newSize)
		}
	}
	logBuffer.stopTime = ts
	logBuffer.idx = append(logBuffer.idx, logBuffer.pos)
	util.Uint32toBytes(logBuffer.sizeBuf, uint32(size))
	copy(logBuffer.buf[logBuffer.pos:logBuffer.pos+4], logBuffer.sizeBuf)
	copy(logBuffer.buf[logBuffer.pos+4:logBuffer.pos+4+size], logEntryData)
	logBuffer.pos += size + 4
}

func (logBuffer *LogBuffer) IsStopping() bool {
	return logBuffer.isStopping.Load()
}

// ForceFlush immediately flushes the current buffer content and WAITS for completion
// This is useful for critical topics that need immediate persistence
// CRITICAL: This function is now SYNCHRONOUS - it blocks until the flush completes
func (logBuffer *LogBuffer) ForceFlush() {
	if logBuffer.isStopping.Load() {
		return // Don't flush if we're shutting down
	}
	logBuffer.Lock()
	toFlush := logBuffer.copyToFlushWithCallback()
	logBuffer.Unlock()
	if toFlush != nil {
		// Send to flush channel (with reasonable timeout)
		select {
		case logBuffer.flushChan <- toFlush:
			// Successfully queued for flush - now WAIT for it to complete
			select {
			case <-toFlush.done:
				// Flush completed successfully
				glog.V(1).Infof("ForceFlush completed for %s", logBuffer.name)
			case <-time.After(5 * time.Second):
				// Timeout waiting for flush - this shouldn't happen
				glog.Warningf("ForceFlush timed out waiting for completion on %s", logBuffer.name)
			}
		case <-time.After(2 * time.Second):
			// If flush channel is still blocked after 2s, something is wrong
			glog.Warningf("ForceFlush channel timeout for %s - flush channel busy for 2s", logBuffer.name)
		}
	}
}

// ShutdownLogBuffer flushes the buffer and stops the log buffer
func (logBuffer *LogBuffer) ShutdownLogBuffer() {
	isAlreadyStopped := logBuffer.isStopping.Swap(true)
	if isAlreadyStopped {
		return
	}
	toFlush := logBuffer.copyToFlush()
	logBuffer.flushChan <- toFlush
	close(logBuffer.flushChan)
}

// IsAllFlushed returns true if all data in the buffer has been flushed, after calling ShutdownLogBuffer().
func (logBuffer *LogBuffer) IsAllFlushed() bool {
	return logBuffer.isAllFlushed
}
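
// Usage sketch (hypothetical shutdown path): stop the buffer, then wait for the flush
// goroutine to drain everything before exiting:
//
//	lb.ShutdownLogBuffer()
//	for !lb.IsAllFlushed() {
//		time.Sleep(10 * time.Millisecond)
//	}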

func (logBuffer *LogBuffer) loopFlush() {
	for d := range logBuffer.flushChan {
		if d != nil {
			// glog.V(4).Infof("%s flush [%v, %v] size %d", m.name, d.startTime, d.stopTime, len(d.data.Bytes()))
			logBuffer.flushFn(logBuffer, d.startTime, d.stopTime, d.data.Bytes(), d.minOffset, d.maxOffset)
			d.releaseMemory()
			// local logbuffer is different from aggregate logbuffer here
			logBuffer.lastFlushDataTime = d.stopTime
			// CRITICAL: Track what's been flushed to disk for both offset-based and time-based reads
			// CRITICAL FIX: Use >= 0 to include offset 0 (first message in a topic)
			if d.maxOffset >= 0 {
				logBuffer.lastFlushedOffset.Store(d.maxOffset)
			}
			if !d.stopTime.IsZero() {
				logBuffer.lastFlushedTime.Store(d.stopTime.UnixNano())
			}
			// Signal completion if there's a callback channel
			if d.done != nil {
				close(d.done)
			}
		}
	}
	logBuffer.isAllFlushed = true
}

func (logBuffer *LogBuffer) loopInterval() {
	for !logBuffer.IsStopping() {
		time.Sleep(logBuffer.flushInterval)
		if logBuffer.IsStopping() {
			return
		}
		logBuffer.Lock()
		toFlush := logBuffer.copyToFlush()
		logBuffer.Unlock()
		if toFlush != nil {
			glog.V(4).Infof("%s flush [%v, %v] size %d", logBuffer.name, toFlush.startTime, toFlush.stopTime, len(toFlush.data.Bytes()))
			logBuffer.flushChan <- toFlush
		} else {
			// glog.V(0).Infof("%s no flush", m.name)
		}
	}
}

func (logBuffer *LogBuffer) copyToFlush() *dataToFlush {
	return logBuffer.copyToFlushInternal(false)
}

func (logBuffer *LogBuffer) copyToFlushWithCallback() *dataToFlush {
	return logBuffer.copyToFlushInternal(true)
}

func (logBuffer *LogBuffer) copyToFlushInternal(withCallback bool) *dataToFlush {
	if logBuffer.pos > 0 {
		var d *dataToFlush
		if logBuffer.flushFn != nil {
			d = &dataToFlush{
				startTime: logBuffer.startTime,
				stopTime:  logBuffer.stopTime,
				data:      copiedBytes(logBuffer.buf[:logBuffer.pos]),
				minOffset: logBuffer.minOffset,
				maxOffset: logBuffer.maxOffset,
			}
			// Add callback channel for synchronous ForceFlush
			if withCallback {
				d.done = make(chan struct{})
			}
			// glog.V(4).Infof("%s flushing [0,%d) with %d entries [%v, %v]", m.name, m.pos, len(m.idx), m.startTime, m.stopTime)
		} else {
			// glog.V(4).Infof("%s removed from memory [0,%d) with %d entries [%v, %v]", m.name, m.pos, len(m.idx), m.startTime, m.stopTime)
			logBuffer.lastFlushDataTime = logBuffer.stopTime
		}
		// CRITICAL: logBuffer.offset is the "next offset to assign", so last offset in buffer is offset-1
		lastOffsetInBuffer := logBuffer.offset - 1
		logBuffer.buf = logBuffer.prevBuffers.SealBuffer(logBuffer.startTime, logBuffer.stopTime, logBuffer.buf, logBuffer.pos, logBuffer.bufferStartOffset, lastOffsetInBuffer)
		logBuffer.startTime = time.Unix(0, 0)
		logBuffer.stopTime = time.Unix(0, 0)
		logBuffer.pos = 0
		logBuffer.idx = logBuffer.idx[:0]
		// DON'T increment offset - it's already pointing to the next offset!
		// logBuffer.offset++ // REMOVED - this was causing offset gaps!
		logBuffer.bufferStartOffset = logBuffer.offset // Next buffer starts at current offset (which is already the next one)
		// Reset offset tracking
		logBuffer.hasOffsets = false
		logBuffer.minOffset = 0
		logBuffer.maxOffset = 0
		return d
	}
	return nil
}

func (logBuffer *LogBuffer) GetEarliestTime() time.Time {
	return logBuffer.startTime
}

func (logBuffer *LogBuffer) GetEarliestPosition() MessagePosition {
	return MessagePosition{
		Time:   logBuffer.startTime,
		Offset: logBuffer.offset,
	}
}

func (d *dataToFlush) releaseMemory() {
	d.data.Reset()
	bufferPool.Put(d.data)
}

func (logBuffer *LogBuffer) ReadFromBuffer(lastReadPosition MessagePosition) (bufferCopy *bytes.Buffer, batchIndex int64, err error) {
	logBuffer.RLock()
	defer logBuffer.RUnlock()
	isOffsetBased := lastReadPosition.IsOffsetBased
	// CRITICAL FIX: For offset-based subscriptions, use offset comparisons, not time comparisons!
	if isOffsetBased {
		requestedOffset := lastReadPosition.Offset
		// DEBUG: Log buffer state for _schemas topic
		if strings.Contains(logBuffer.name, "_schemas") {
			glog.Infof("[SCHEMAS ReadFromBuffer] requested=%d bufferStart=%d bufferEnd=%d pos=%d lastFlushed=%d",
				requestedOffset, logBuffer.bufferStartOffset, logBuffer.offset, logBuffer.pos, logBuffer.lastFlushedOffset.Load())
		}
		// Check if the requested offset is in the current buffer range
		if requestedOffset >= logBuffer.bufferStartOffset && requestedOffset <= logBuffer.offset {
			// If current buffer is empty (pos=0), check if data is on disk or not yet written
			if logBuffer.pos == 0 {
				// CRITICAL FIX: If buffer is empty but offset range covers the request,
				// it means data was in memory and has been flushed/moved out.
				// The bufferStartOffset advancing to cover this offset proves data existed.
				//
				// Three cases:
				// 1. requestedOffset < logBuffer.offset: Data was here, now flushed
				// 2. requestedOffset == logBuffer.offset && bufferStartOffset > 0: Buffer advanced, data flushed
				// 3. requestedOffset == logBuffer.offset && bufferStartOffset == 0: Initial state - try disk first!
				//
				// Cases 1 & 2: try disk read
				// Case 3: try disk read (historical data might exist)
				if requestedOffset < logBuffer.offset {
					// Data was in the buffer range but buffer is now empty = flushed to disk
					if strings.Contains(logBuffer.name, "_schemas") {
						glog.Infof("[SCHEMAS ReadFromBuffer] Returning ResumeFromDiskError: empty buffer, offset %d was flushed (bufferStart=%d, offset=%d)",
							requestedOffset, logBuffer.bufferStartOffset, logBuffer.offset)
					}
					return nil, -2, ResumeFromDiskError
				}
				// requestedOffset == logBuffer.offset: Current position
				// CRITICAL: For subscribers starting from offset 0, try disk read first
				// (historical data might exist from previous runs)
				if requestedOffset == 0 && logBuffer.bufferStartOffset == 0 && logBuffer.offset == 0 {
					// Initial state: try disk read before waiting for new data
					if strings.Contains(logBuffer.name, "_schemas") {
						glog.Infof("[SCHEMAS ReadFromBuffer] Initial state, trying disk read for offset 0")
					}
					return nil, -2, ResumeFromDiskError
				}
				// Otherwise, wait for new data to arrive
				if strings.Contains(logBuffer.name, "_schemas") {
					glog.Infof("[SCHEMAS ReadFromBuffer] Returning nil: waiting for offset %d to arrive", requestedOffset)
				}
				return nil, logBuffer.offset, nil
			}
			if strings.Contains(logBuffer.name, "_schemas") {
				glog.Infof("[SCHEMAS ReadFromBuffer] Returning %d bytes from buffer", logBuffer.pos)
			}
			return copiedBytes(logBuffer.buf[:logBuffer.pos]), logBuffer.offset, nil
		}
		// Check previous buffers for the requested offset
		for _, buf := range logBuffer.prevBuffers.buffers {
			if requestedOffset >= buf.startOffset && requestedOffset <= buf.offset {
				// If prevBuffer is empty, it means the data was flushed to disk
				// (prevBuffers are created when buffer is flushed)
				if buf.size == 0 {
					// Empty prevBuffer covering this offset means data was flushed
					return nil, -2, ResumeFromDiskError
				}
				return copiedBytes(buf.buf[:buf.size]), buf.offset, nil
			}
		}
		// Offset not found in any buffer
		if requestedOffset < logBuffer.bufferStartOffset {
			// Data not in current buffers - must be on disk (flushed or never existed)
			// Return ResumeFromDiskError to trigger disk read
			return nil, -2, ResumeFromDiskError
		}
		if requestedOffset > logBuffer.offset {
			// Future data, not available yet
			return nil, logBuffer.offset, nil
		}
		// Offset not found - return nil
		return nil, logBuffer.offset, nil
	}
	// TIMESTAMP-BASED READ (original logic)
	// Read from disk and memory
	//	1. read from disk, the last time read is td
	//	2. in memory, the earliest time is tm
	//	if tm <= td, case 2.1
	//		read from memory
	//	if tm is empty, case 2.2
	//		read from memory
	//	if td < tm, case 2.3
	//		read from disk again
	var tsMemory time.Time
	var tsBatchIndex int64
	if !logBuffer.startTime.IsZero() {
		tsMemory = logBuffer.startTime
		tsBatchIndex = logBuffer.offset
	}
	for _, prevBuf := range logBuffer.prevBuffers.buffers {
		if !prevBuf.startTime.IsZero() && prevBuf.startTime.Before(tsMemory) {
			tsMemory = prevBuf.startTime
			tsBatchIndex = prevBuf.offset
		}
	}
	if tsMemory.IsZero() { // case 2.2
		return nil, -2, nil
	} else if lastReadPosition.Time.Before(tsMemory) && lastReadPosition.Offset+1 < tsBatchIndex { // case 2.3
		// Special case: If the requested time is zero (Unix epoch), treat it as "start from beginning"
		// This handles queries that want to read all data without knowing the exact start time
		if lastReadPosition.Time.IsZero() || lastReadPosition.Time.Unix() == 0 {
			// Start from the beginning of memory
			// Fall through to case 2.1 to read from the earliest buffer
		} else {
			// Data not in memory buffers - read from disk
			glog.V(0).Infof("resume from disk: requested time %v < earliest memory time %v",
				lastReadPosition.Time, tsMemory)
			return nil, -2, ResumeFromDiskError
		}
	}
	// the following is case 2.1
	if lastReadPosition.Time.Equal(logBuffer.stopTime) {
		return nil, logBuffer.offset, nil
	}
	if lastReadPosition.Time.After(logBuffer.stopTime) {
		// glog.Fatalf("unexpected last read time %v, older than latest %v", lastReadPosition, m.stopTime)
		return nil, logBuffer.offset, nil
	}
	if lastReadPosition.Time.Before(logBuffer.startTime) {
		for _, buf := range logBuffer.prevBuffers.buffers {
			if buf.startTime.After(lastReadPosition.Time) {
				// glog.V(4).Infof("%s return the %d sealed buffer %v", m.name, i, buf.startTime)
				return copiedBytes(buf.buf[:buf.size]), buf.offset, nil
			}
			if !buf.startTime.After(lastReadPosition.Time) && buf.stopTime.After(lastReadPosition.Time) {
				pos := buf.locateByTs(lastReadPosition.Time)
				return copiedBytes(buf.buf[pos:buf.size]), buf.offset, nil
			}
		}
		// glog.V(4).Infof("%s return the current buf %v", m.name, lastReadPosition)
		return copiedBytes(logBuffer.buf[:logBuffer.pos]), logBuffer.offset, nil
	}
	lastTs := lastReadPosition.Time.UnixNano()
	l, h := 0, len(logBuffer.idx)-1
	/*
		for i, pos := range m.idx {
			logEntry, ts := readTs(m.buf, pos)
			event := &filer_pb.SubscribeMetadataResponse{}
			proto.Unmarshal(logEntry.Data, event)
			entry := event.EventNotification.OldEntry
			if entry == nil {
				entry = event.EventNotification.NewEntry
			}
		}
	*/
	for l <= h {
		mid := (l + h) / 2
		pos := logBuffer.idx[mid]
		_, t := readTs(logBuffer.buf, pos)
		if t <= lastTs {
			l = mid + 1
		} else if lastTs < t {
			var prevT int64
			if mid > 0 {
				_, prevT = readTs(logBuffer.buf, logBuffer.idx[mid-1])
			}
			if prevT <= lastTs {
				return copiedBytes(logBuffer.buf[pos:logBuffer.pos]), logBuffer.offset, nil
			}
			h = mid
		}
	}
	// Binary search didn't find the timestamp - the data may have been flushed to disk already
	// Returning -2 signals to the caller that the data is not available in memory
	return nil, -2, nil
}
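
// Usage sketch (hypothetical read loop): callers treat ResumeFromDiskError as a signal to
// switch to ReadFromDiskFn and then retry the in-memory path with the advanced position:
//
//	buf, _, err := lb.ReadFromBuffer(pos)
//	if err == ResumeFromDiskError {
//		pos, _, err = lb.ReadFromDiskFn(pos, 0, eachLogEntryFn) // eachLogEntryFn is a hypothetical callback
//		// ... then call ReadFromBuffer again with the advanced position
//	} else if buf != nil {
//		// iterate the entries packed in buf, then return it to the pool
//		lb.ReleaseMemory(buf)
//	}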

func (logBuffer *LogBuffer) ReleaseMemory(b *bytes.Buffer) {
	bufferPool.Put(b)
}

// GetName returns the log buffer name for metadata tracking
func (logBuffer *LogBuffer) GetName() string {
	logBuffer.RLock()
	defer logBuffer.RUnlock()
	return logBuffer.name
}

// GetOffset returns the current offset for metadata tracking
func (logBuffer *LogBuffer) GetOffset() int64 {
	logBuffer.RLock()
	defer logBuffer.RUnlock()
	return logBuffer.offset
}

var bufferPool = sync.Pool{
	New: func() interface{} {
		return new(bytes.Buffer)
	},
}

func copiedBytes(buf []byte) (copied *bytes.Buffer) {
	copied = bufferPool.Get().(*bytes.Buffer)
	copied.Reset()
	copied.Write(buf)
	return
}

func readTs(buf []byte, pos int) (size int, ts int64) {
	size = int(util.BytesToUint32(buf[pos : pos+4]))
	entryData := buf[pos+4 : pos+4+size]
	logEntry := &filer_pb.LogEntry{}
	err := proto.Unmarshal(entryData, logEntry)
	if err != nil {
		glog.Fatalf("unexpected unmarshal filer_pb.LogEntry: %v", err)
	}
	return size, logEntry.TsNs
}