You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
796 lines
28 KiB
796 lines
28 KiB
package log_buffer
|
|
|
|
import (
|
|
"bytes"
|
|
"math"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"google.golang.org/protobuf/proto"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/mq_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/util"
|
|
)
|
|
|
|
// BufferSize is the initial capacity of the in-memory log buffer (8 MiB).
const BufferSize = 8 * 1024 * 1024

// PreviousBufferCount is how many sealed (already rotated) buffers are
// retained in memory so recent readers can be served without a disk read.
const PreviousBufferCount = 32
|
|
|
|
// dataToFlush is a snapshot of a sealed buffer handed to the flush goroutine.
type dataToFlush struct {
	startTime time.Time     // timestamp of the first entry in data
	stopTime  time.Time     // timestamp of the last entry in data
	data      *bytes.Buffer // pooled copy of the raw length-prefixed entries
	minOffset int64         // smallest per-entry offset in data (Kafka integration)
	maxOffset int64         // largest per-entry offset in data
	done      chan struct{} // Signal when flush completes (set only for synchronous ForceFlush)
}
|
|
|
|
// EachLogEntryFuncType is called once per log entry; returning isDone=true stops iteration.
type EachLogEntryFuncType func(logEntry *filer_pb.LogEntry) (isDone bool, err error)

// EachLogEntryWithOffsetFuncType is like EachLogEntryFuncType but also receives the entry's offset.
type EachLogEntryWithOffsetFuncType func(logEntry *filer_pb.LogEntry, offset int64) (isDone bool, err error)

// LogFlushFuncType persists one sealed buffer; buf covers [startTime, stopTime]
// and the tracked offset range [minOffset, maxOffset].
type LogFlushFuncType func(logBuffer *LogBuffer, startTime, stopTime time.Time, buf []byte, minOffset, maxOffset int64)

// LogReadFromDiskFuncType reads persisted entries starting at startPosition,
// invoking eachLogEntryFn per entry, and reports where reading stopped.
type LogReadFromDiskFuncType func(startPosition MessagePosition, stopTsNs int64, eachLogEntryFn EachLogEntryFuncType) (lastReadPosition MessagePosition, isDone bool, err error)
|
|
|
|
// LogBuffer accumulates log entries in memory, seals the buffer on interval or
// size boundaries, and hands sealed buffers to a background flush goroutine.
// The embedded RWMutex guards the non-atomic fields.
type LogBuffer struct {
	LastFlushTsNs int64 // NOTE(review): not written in this file — presumably maintained by callers; verify
	name          string
	prevBuffers   *SealedBuffers // recently sealed buffers, still readable from memory
	buf           []byte         // current buffer: 4-byte length prefix + marshaled LogEntry, repeated
	offset        int64          // Last offset in current buffer (endOffset)
	bufferStartOffset int64      // First offset in current buffer
	idx           []int          // start position in buf of each entry (binary search by timestamp)
	pos           int            // next write position within buf; 0 means the buffer is empty
	startTime     time.Time      // timestamp of the first entry in the current buffer
	stopTime      time.Time      // timestamp of the last entry in the current buffer
	lastFlushDataTime time.Time  // stopTime of the most recently flushed (or dropped) buffer
	sizeBuf       []byte         // 4-byte scratch buffer for the length prefix
	flushInterval time.Duration  // maximum age of buffered data before sealing
	flushFn       LogFlushFuncType
	ReadFromDiskFn LogReadFromDiskFuncType
	notifyFn      func()
	// Per-subscriber notification channels for instant wake-up
	subscribersMu sync.RWMutex
	subscribers   map[string]chan struct{} // subscriberID -> notification channel
	isStopping    *atomic.Bool
	isAllFlushed  bool
	flushChan     chan *dataToFlush
	LastTsNs      atomic.Int64 // last assigned entry timestamp (ns), used for collision avoidance
	// Offset range tracking for Kafka integration
	minOffset  int64
	maxOffset  int64
	hasOffsets bool
	lastFlushedOffset atomic.Int64 // Highest offset that has been flushed to disk (-1 = nothing flushed yet)
	lastFlushedTime   atomic.Int64 // Latest timestamp that has been flushed to disk (0 = nothing flushed yet)
	sync.RWMutex
}
|
|
|
|
func NewLogBuffer(name string, flushInterval time.Duration, flushFn LogFlushFuncType,
|
|
readFromDiskFn LogReadFromDiskFuncType, notifyFn func()) *LogBuffer {
|
|
lb := &LogBuffer{
|
|
name: name,
|
|
prevBuffers: newSealedBuffers(PreviousBufferCount),
|
|
buf: make([]byte, BufferSize),
|
|
sizeBuf: make([]byte, 4),
|
|
flushInterval: flushInterval,
|
|
flushFn: flushFn,
|
|
ReadFromDiskFn: readFromDiskFn,
|
|
notifyFn: notifyFn,
|
|
subscribers: make(map[string]chan struct{}),
|
|
flushChan: make(chan *dataToFlush, 256),
|
|
isStopping: new(atomic.Bool),
|
|
offset: 0, // Will be initialized from existing data if available
|
|
}
|
|
lb.lastFlushedOffset.Store(-1) // Nothing flushed to disk yet
|
|
go lb.loopFlush()
|
|
go lb.loopInterval()
|
|
return lb
|
|
}
|
|
|
|
// RegisterSubscriber registers a subscriber for instant notifications when data is written
|
|
// Returns a channel that will receive notifications (<1ms latency)
|
|
func (logBuffer *LogBuffer) RegisterSubscriber(subscriberID string) chan struct{} {
|
|
logBuffer.subscribersMu.Lock()
|
|
defer logBuffer.subscribersMu.Unlock()
|
|
|
|
// Check if already registered
|
|
if existingChan, exists := logBuffer.subscribers[subscriberID]; exists {
|
|
glog.V(2).Infof("Subscriber %s already registered for %s, reusing channel", subscriberID, logBuffer.name)
|
|
return existingChan
|
|
}
|
|
|
|
// Create buffered channel (size 1) so notifications never block
|
|
notifyChan := make(chan struct{}, 1)
|
|
logBuffer.subscribers[subscriberID] = notifyChan
|
|
glog.V(1).Infof("Registered subscriber %s for %s (total: %d)", subscriberID, logBuffer.name, len(logBuffer.subscribers))
|
|
return notifyChan
|
|
}
|
|
|
|
// UnregisterSubscriber removes a subscriber and closes its notification channel
|
|
func (logBuffer *LogBuffer) UnregisterSubscriber(subscriberID string) {
|
|
logBuffer.subscribersMu.Lock()
|
|
defer logBuffer.subscribersMu.Unlock()
|
|
|
|
if ch, exists := logBuffer.subscribers[subscriberID]; exists {
|
|
close(ch)
|
|
delete(logBuffer.subscribers, subscriberID)
|
|
glog.V(1).Infof("Unregistered subscriber %s from %s (remaining: %d)", subscriberID, logBuffer.name, len(logBuffer.subscribers))
|
|
}
|
|
}
|
|
|
|
// IsOffsetInMemory checks if the given offset is available in the in-memory buffer
|
|
// Returns true if:
|
|
// 1. Offset is newer than what's been flushed to disk (must be in memory)
|
|
// 2. Offset is in current buffer or previous buffers (may be flushed but still in memory)
|
|
// Returns false if offset is older than memory buffers (only on disk)
|
|
func (logBuffer *LogBuffer) IsOffsetInMemory(offset int64) bool {
|
|
logBuffer.RLock()
|
|
defer logBuffer.RUnlock()
|
|
|
|
// Check if we're tracking offsets at all
|
|
if !logBuffer.hasOffsets {
|
|
return false // No offsets tracked yet
|
|
}
|
|
|
|
// OPTIMIZATION: If offset is newer than what's been flushed to disk,
|
|
// it MUST be in memory (not written to disk yet)
|
|
lastFlushed := logBuffer.lastFlushedOffset.Load()
|
|
if lastFlushed >= 0 && offset > lastFlushed {
|
|
glog.V(3).Infof("Offset %d is in memory (newer than lastFlushed=%d)", offset, lastFlushed)
|
|
return true
|
|
}
|
|
|
|
// Check if offset is in current buffer range AND buffer has data
|
|
// (data can be both on disk AND in memory during flush window)
|
|
if offset >= logBuffer.bufferStartOffset && offset <= logBuffer.offset {
|
|
// CRITICAL: Check if buffer actually has data (pos > 0)
|
|
// After flush, pos=0 but range is still valid - data is on disk, not in memory
|
|
if logBuffer.pos > 0 {
|
|
glog.V(3).Infof("Offset %d is in current buffer [%d-%d] with data", offset, logBuffer.bufferStartOffset, logBuffer.offset)
|
|
return true
|
|
}
|
|
// Buffer is empty (just flushed) - data is on disk
|
|
glog.V(3).Infof("Offset %d in range [%d-%d] but buffer empty (pos=0), data on disk", offset, logBuffer.bufferStartOffset, logBuffer.offset)
|
|
return false
|
|
}
|
|
|
|
// Check if offset is in previous buffers AND they have data
|
|
for _, buf := range logBuffer.prevBuffers.buffers {
|
|
if offset >= buf.startOffset && offset <= buf.offset {
|
|
// Check if prevBuffer actually has data
|
|
if buf.size > 0 {
|
|
glog.V(3).Infof("Offset %d is in previous buffer [%d-%d] with data", offset, buf.startOffset, buf.offset)
|
|
return true
|
|
}
|
|
// Buffer is empty (flushed) - data is on disk
|
|
glog.V(3).Infof("Offset %d in prevBuffer [%d-%d] but empty (size=0), data on disk", offset, buf.startOffset, buf.offset)
|
|
return false
|
|
}
|
|
}
|
|
|
|
// Offset is older than memory buffers - only available on disk
|
|
glog.V(3).Infof("Offset %d is NOT in memory (bufferStart=%d, lastFlushed=%d)", offset, logBuffer.bufferStartOffset, lastFlushed)
|
|
return false
|
|
}
|
|
|
|
// notifySubscribers sends notifications to all registered subscribers
|
|
// Non-blocking: uses select with default to avoid blocking on full channels
|
|
func (logBuffer *LogBuffer) notifySubscribers() {
|
|
logBuffer.subscribersMu.RLock()
|
|
defer logBuffer.subscribersMu.RUnlock()
|
|
|
|
if len(logBuffer.subscribers) == 0 {
|
|
return // No subscribers, skip notification
|
|
}
|
|
|
|
for subscriberID, notifyChan := range logBuffer.subscribers {
|
|
select {
|
|
case notifyChan <- struct{}{}:
|
|
// Notification sent successfully
|
|
glog.V(3).Infof("Notified subscriber %s for %s", subscriberID, logBuffer.name)
|
|
default:
|
|
// Channel full - subscriber hasn't consumed previous notification yet
|
|
// This is OK because one notification is sufficient to wake the subscriber
|
|
glog.V(3).Infof("Subscriber %s notification channel full (OK - already notified)", subscriberID)
|
|
}
|
|
}
|
|
}
|
|
|
|
// InitializeOffsetFromExistingData initializes the offset counter from existing data on disk
|
|
// This should be called after LogBuffer creation to ensure offset continuity on restart
|
|
func (logBuffer *LogBuffer) InitializeOffsetFromExistingData(getHighestOffsetFn func() (int64, error)) error {
|
|
if getHighestOffsetFn == nil {
|
|
return nil // No initialization function provided
|
|
}
|
|
|
|
highestOffset, err := getHighestOffsetFn()
|
|
if err != nil {
|
|
glog.V(0).Infof("Failed to get highest offset for %s: %v, starting from 0", logBuffer.name, err)
|
|
return nil // Continue with offset 0 if we can't read existing data
|
|
}
|
|
|
|
if highestOffset >= 0 {
|
|
// Set the next offset to be one after the highest existing offset
|
|
nextOffset := highestOffset + 1
|
|
logBuffer.offset = nextOffset
|
|
// CRITICAL FIX: bufferStartOffset should match offset after initialization
|
|
// This ensures that reads for old offsets (0...highestOffset) will trigger disk reads
|
|
// New data written after this will start at nextOffset
|
|
logBuffer.bufferStartOffset = nextOffset
|
|
// CRITICAL: Track that data [0...highestOffset] is on disk
|
|
logBuffer.lastFlushedOffset.Store(highestOffset)
|
|
// Set lastFlushedTime to current time (we know data up to highestOffset is on disk)
|
|
logBuffer.lastFlushedTime.Store(time.Now().UnixNano())
|
|
glog.V(0).Infof("Initialized LogBuffer %s offset to %d (highest existing: %d), buffer starts at %d, lastFlushedOffset=%d, lastFlushedTime=%v",
|
|
logBuffer.name, nextOffset, highestOffset, nextOffset, highestOffset, time.Now())
|
|
} else {
|
|
logBuffer.bufferStartOffset = 0 // Start from offset 0
|
|
// No data on disk yet
|
|
glog.V(0).Infof("No existing data found for %s, starting from offset 0, lastFlushedOffset=-1, lastFlushedTime=0", logBuffer.name)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// AddToBuffer appends a DataMessage by delegating to AddDataToBuffer.
func (logBuffer *LogBuffer) AddToBuffer(message *mq_pb.DataMessage) {
	logBuffer.AddDataToBuffer(message.Key, message.Value, message.TsNs)
}
|
|
|
|
// AddLogEntryToBuffer directly adds a LogEntry to the buffer, preserving offset information
// (entries on this path already carry per-entry offsets, e.g. from the Kafka
// integration). It marshals the entry, appends it under the write lock, and —
// after unlocking in the deferred function — hands any sealed buffer to the
// flush goroutine and wakes subscribers.
func (logBuffer *LogBuffer) AddLogEntryToBuffer(logEntry *filer_pb.LogEntry) {
	logEntryData, _ := proto.Marshal(logEntry)

	var toFlush *dataToFlush
	logBuffer.Lock()
	defer func() {
		// Unlock BEFORE sending to flushChan / notifying, so consumers woken
		// here can immediately acquire the lock.
		logBuffer.Unlock()
		if toFlush != nil {
			logBuffer.flushChan <- toFlush
		}
		if logBuffer.notifyFn != nil {
			logBuffer.notifyFn()
		}
		// Notify all registered subscribers instantly (<1ms latency)
		logBuffer.notifySubscribers()
	}()

	processingTsNs := logEntry.TsNs
	ts := time.Unix(0, processingTsNs)

	// Handle timestamp collision inside lock (rare case): bump by 1ns so
	// timestamps stay strictly increasing within this buffer.
	if logBuffer.LastTsNs.Load() >= processingTsNs {
		processingTsNs = logBuffer.LastTsNs.Add(1)
		ts = time.Unix(0, processingTsNs)
		// Re-marshal with corrected timestamp
		logEntry.TsNs = processingTsNs
		logEntryData, _ = proto.Marshal(logEntry)
	} else {
		logBuffer.LastTsNs.Store(processingTsNs)
	}

	size := len(logEntryData)

	if logBuffer.pos == 0 {
		logBuffer.startTime = ts
		// Reset offset tracking for new buffer
		logBuffer.hasOffsets = false
	}

	// Track offset ranges for Kafka integration
	// CRITICAL FIX: Use >= 0 to include offset 0 (first message in a topic)
	if logEntry.Offset >= 0 {
		if !logBuffer.hasOffsets {
			logBuffer.minOffset = logEntry.Offset
			logBuffer.maxOffset = logEntry.Offset
			logBuffer.hasOffsets = true
		} else {
			if logEntry.Offset < logBuffer.minOffset {
				logBuffer.minOffset = logEntry.Offset
			}
			if logEntry.Offset > logBuffer.maxOffset {
				logBuffer.maxOffset = logEntry.Offset
			}
		}
	}

	// Seal the buffer when the flush interval has elapsed or the entry
	// (payload + 4-byte length prefix) no longer fits in the remaining space.
	if logBuffer.startTime.Add(logBuffer.flushInterval).Before(ts) || len(logBuffer.buf)-logBuffer.pos < size+4 {
		toFlush = logBuffer.copyToFlush()
		logBuffer.startTime = ts
		if len(logBuffer.buf) < size+4 {
			// Validate size to prevent integer overflow in computation BEFORE allocation
			const maxBufferSize = 1 << 30 // 1 GiB practical limit
			// Ensure 2*size + 4 won't overflow int and stays within practical bounds
			if size < 0 || size > (math.MaxInt-4)/2 || size > (maxBufferSize-4)/2 {
				glog.Errorf("Buffer size out of valid range: %d bytes, skipping", size)
				// NOTE(review): this entry is silently dropped here; the deferred
				// func still delivers toFlush and fires notifications.
				return
			}
			// Safe to compute now that we've validated size is in valid range
			newSize := 2*size + 4
			logBuffer.buf = make([]byte, newSize)
		}
	}
	logBuffer.stopTime = ts

	// Append the record: remember its start position in idx, then write the
	// 4-byte length prefix followed by the marshaled entry.
	logBuffer.idx = append(logBuffer.idx, logBuffer.pos)
	util.Uint32toBytes(logBuffer.sizeBuf, uint32(size))
	copy(logBuffer.buf[logBuffer.pos:logBuffer.pos+4], logBuffer.sizeBuf)
	copy(logBuffer.buf[logBuffer.pos+4:logBuffer.pos+4+size], logEntryData)
	logBuffer.pos += size + 4

	// Advance the "next offset to assign" counter.
	logBuffer.offset++
}
|
|
|
|
// AddDataToBuffer wraps (partitionKey, data, processingTsNs) in a LogEntry and
// appends it to the buffer. A zero processingTsNs means "now". Unlike
// AddLogEntryToBuffer, this path carries no per-entry offset and does not
// advance logBuffer.offset.
func (logBuffer *LogBuffer) AddDataToBuffer(partitionKey, data []byte, processingTsNs int64) {

	// PERFORMANCE OPTIMIZATION: Pre-process expensive operations OUTSIDE the lock
	var ts time.Time
	if processingTsNs == 0 {
		ts = time.Now()
		processingTsNs = ts.UnixNano()
	} else {
		ts = time.Unix(0, processingTsNs)
	}

	logEntry := &filer_pb.LogEntry{
		TsNs:             processingTsNs, // Will be updated if needed
		PartitionKeyHash: util.HashToInt32(partitionKey),
		Data:             data,
		Key:              partitionKey,
	}

	logEntryData, _ := proto.Marshal(logEntry)

	var toFlush *dataToFlush
	logBuffer.Lock()
	defer func() {
		// Unlock before channel send / callbacks so consumers can take the lock.
		logBuffer.Unlock()
		if toFlush != nil {
			logBuffer.flushChan <- toFlush
		}
		if logBuffer.notifyFn != nil {
			logBuffer.notifyFn()
		}
		// Notify all registered subscribers instantly (<1ms latency)
		logBuffer.notifySubscribers()
	}()

	// Handle timestamp collision inside lock (rare case)
	if logBuffer.LastTsNs.Load() >= processingTsNs {
		processingTsNs = logBuffer.LastTsNs.Add(1)
		ts = time.Unix(0, processingTsNs)
		// Re-marshal with corrected timestamp
		logEntry.TsNs = processingTsNs
		logEntryData, _ = proto.Marshal(logEntry)
	} else {
		logBuffer.LastTsNs.Store(processingTsNs)
	}

	size := len(logEntryData)

	if logBuffer.pos == 0 {
		logBuffer.startTime = ts
	}

	// Seal when the flush interval has elapsed or the record no longer fits.
	if logBuffer.startTime.Add(logBuffer.flushInterval).Before(ts) || len(logBuffer.buf)-logBuffer.pos < size+4 {
		// glog.V(0).Infof("%s copyToFlush1 offset:%d count:%d start time %v, ts %v, remaining %d bytes", logBuffer.name, logBuffer.offset, len(logBuffer.idx), logBuffer.startTime, ts, len(logBuffer.buf)-logBuffer.pos)
		toFlush = logBuffer.copyToFlush()
		logBuffer.startTime = ts
		if len(logBuffer.buf) < size+4 {
			// Validate size to prevent integer overflow in computation BEFORE allocation
			const maxBufferSize = 1 << 30 // 1 GiB practical limit
			// Ensure 2*size + 4 won't overflow int and stays within practical bounds
			if size < 0 || size > (math.MaxInt-4)/2 || size > (maxBufferSize-4)/2 {
				glog.Errorf("Buffer size out of valid range: %d bytes, skipping", size)
				// NOTE(review): the entry is dropped; toFlush is still delivered.
				return
			}
			// Safe to compute now that we've validated size is in valid range
			newSize := 2*size + 4
			logBuffer.buf = make([]byte, newSize)
		}
	}
	logBuffer.stopTime = ts

	// Append: record start position, 4-byte length prefix, then the payload.
	logBuffer.idx = append(logBuffer.idx, logBuffer.pos)
	util.Uint32toBytes(logBuffer.sizeBuf, uint32(size))
	copy(logBuffer.buf[logBuffer.pos:logBuffer.pos+4], logBuffer.sizeBuf)
	copy(logBuffer.buf[logBuffer.pos+4:logBuffer.pos+4+size], logEntryData)
	logBuffer.pos += size + 4

}
|
|
|
|
// IsStopping reports whether ShutdownLogBuffer has been initiated.
func (logBuffer *LogBuffer) IsStopping() bool {
	return logBuffer.isStopping.Load()
}
|
|
|
|
// ForceFlush immediately flushes the current buffer content and WAITS for completion
// This is useful for critical topics that need immediate persistence
// CRITICAL: This function is now SYNCHRONOUS - it blocks until the flush completes
// (bounded by a 2s cap to enqueue the flush plus a 5s cap waiting for it to finish).
func (logBuffer *LogBuffer) ForceFlush() {
	if logBuffer.isStopping.Load() {
		return // Don't flush if we're shutting down
	}

	// Seal under the write lock; the withCallback variant attaches a done channel.
	logBuffer.Lock()
	toFlush := logBuffer.copyToFlushWithCallback()
	logBuffer.Unlock()

	// toFlush is nil when the buffer was empty - nothing to wait for.
	if toFlush != nil {
		// Send to flush channel (with reasonable timeout)
		select {
		case logBuffer.flushChan <- toFlush:
			// Successfully queued for flush - now WAIT for it to complete
			select {
			case <-toFlush.done:
				// Flush completed successfully
				glog.V(1).Infof("ForceFlush completed for %s", logBuffer.name)
			case <-time.After(5 * time.Second):
				// Timeout waiting for flush - this shouldn't happen
				glog.Warningf("ForceFlush timed out waiting for completion on %s", logBuffer.name)
			}
		case <-time.After(2 * time.Second):
			// If flush channel is still blocked after 2s, something is wrong
			glog.Warningf("ForceFlush channel timeout for %s - flush channel busy for 2s", logBuffer.name)
		}
	}
}
|
|
|
|
// ShutdownLogBuffer flushes the buffer and stops the log buffer
|
|
func (logBuffer *LogBuffer) ShutdownLogBuffer() {
|
|
isAlreadyStopped := logBuffer.isStopping.Swap(true)
|
|
if isAlreadyStopped {
|
|
return
|
|
}
|
|
toFlush := logBuffer.copyToFlush()
|
|
logBuffer.flushChan <- toFlush
|
|
close(logBuffer.flushChan)
|
|
}
|
|
|
|
// IsAllFlushed returns true if all data in the buffer has been flushed, after calling ShutdownLogBuffer().
// NOTE(review): isAllFlushed is written by loopFlush without synchronization,
// so reads here may race — confirm callers only poll this after shutdown.
func (logBuffer *LogBuffer) IsAllFlushed() bool {
	return logBuffer.isAllFlushed
}
|
|
|
|
// loopFlush runs in its own goroutine, draining flushChan and persisting each
// sealed buffer via flushFn. It records flush progress (last offset and last
// timestamp on disk) so readers can route between memory and disk. When
// flushChan is closed by ShutdownLogBuffer, it marks isAllFlushed and exits.
func (logBuffer *LogBuffer) loopFlush() {
	for d := range logBuffer.flushChan {
		if d != nil {
			// glog.V(4).Infof("%s flush [%v, %v] size %d", m.name, d.startTime, d.stopTime, len(d.data.Bytes()))
			logBuffer.flushFn(logBuffer, d.startTime, d.stopTime, d.data.Bytes(), d.minOffset, d.maxOffset)
			// Return the pooled buffer once flushFn has consumed the bytes.
			d.releaseMemory()
			// local logbuffer is different from aggregate logbuffer here
			logBuffer.lastFlushDataTime = d.stopTime

			// CRITICAL: Track what's been flushed to disk for both offset-based and time-based reads
			// CRITICAL FIX: Use >= 0 to include offset 0 (first message in a topic)
			if d.maxOffset >= 0 {
				logBuffer.lastFlushedOffset.Store(d.maxOffset)
			}
			if !d.stopTime.IsZero() {
				logBuffer.lastFlushedTime.Store(d.stopTime.UnixNano())
			}

			// Signal completion if there's a callback channel (synchronous ForceFlush)
			if d.done != nil {
				close(d.done)
			}
		}
	}
	logBuffer.isAllFlushed = true
}
|
|
|
|
func (logBuffer *LogBuffer) loopInterval() {
|
|
for !logBuffer.IsStopping() {
|
|
time.Sleep(logBuffer.flushInterval)
|
|
if logBuffer.IsStopping() {
|
|
return
|
|
}
|
|
|
|
logBuffer.Lock()
|
|
toFlush := logBuffer.copyToFlush()
|
|
logBuffer.Unlock()
|
|
if toFlush != nil {
|
|
glog.V(4).Infof("%s flush [%v, %v] size %d", logBuffer.name, toFlush.startTime, toFlush.stopTime, len(toFlush.data.Bytes()))
|
|
logBuffer.flushChan <- toFlush
|
|
} else {
|
|
// glog.V(0).Infof("%s no flush", m.name)
|
|
}
|
|
}
|
|
}
|
|
|
|
// copyToFlush seals the current buffer without a completion callback.
// Caller must hold the write lock.
func (logBuffer *LogBuffer) copyToFlush() *dataToFlush {
	return logBuffer.copyToFlushInternal(false)
}

// copyToFlushWithCallback seals the current buffer and attaches a done channel
// so ForceFlush can block until the flush completes. Caller must hold the write lock.
func (logBuffer *LogBuffer) copyToFlushWithCallback() *dataToFlush {
	return logBuffer.copyToFlushInternal(true)
}
|
|
|
|
// copyToFlushInternal seals the current buffer; caller must hold the write lock.
// When flushFn is set and there is pending data, it snapshots the bytes plus
// the tracked offset range into a dataToFlush (withCallback adds a done channel
// for synchronous waits). Regardless of flushFn, the raw buffer is rotated into
// prevBuffers and position/time/offset tracking is reset. Returns nil when the
// buffer is empty.
func (logBuffer *LogBuffer) copyToFlushInternal(withCallback bool) *dataToFlush {

	if logBuffer.pos > 0 {
		var d *dataToFlush
		if logBuffer.flushFn != nil {
			d = &dataToFlush{
				startTime: logBuffer.startTime,
				stopTime:  logBuffer.stopTime,
				data:      copiedBytes(logBuffer.buf[:logBuffer.pos]),
				minOffset: logBuffer.minOffset,
				maxOffset: logBuffer.maxOffset,
			}
			// Add callback channel for synchronous ForceFlush
			if withCallback {
				d.done = make(chan struct{})
			}
			// glog.V(4).Infof("%s flushing [0,%d) with %d entries [%v, %v]", m.name, m.pos, len(m.idx), m.startTime, m.stopTime)
		} else {
			// No flushFn: nothing is persisted; the data is only rotated out of memory.
			// glog.V(4).Infof("%s removed from memory [0,%d) with %d entries [%v, %v]", m.name, m.pos, len(m.idx), m.startTime, m.stopTime)
			logBuffer.lastFlushDataTime = logBuffer.stopTime
		}
		// CRITICAL: logBuffer.offset is the "next offset to assign", so last offset in buffer is offset-1
		lastOffsetInBuffer := logBuffer.offset - 1
		logBuffer.buf = logBuffer.prevBuffers.SealBuffer(logBuffer.startTime, logBuffer.stopTime, logBuffer.buf, logBuffer.pos, logBuffer.bufferStartOffset, lastOffsetInBuffer)
		logBuffer.startTime = time.Unix(0, 0)
		logBuffer.stopTime = time.Unix(0, 0)
		logBuffer.pos = 0
		logBuffer.idx = logBuffer.idx[:0]
		// DON'T increment offset - it's already pointing to the next offset!
		// Incrementing here would create offset gaps.
		logBuffer.bufferStartOffset = logBuffer.offset // Next buffer starts at current offset (which is already the next one)
		// Reset offset tracking
		logBuffer.hasOffsets = false
		logBuffer.minOffset = 0
		logBuffer.maxOffset = 0
		return d
	}
	return nil
}
|
|
|
|
func (logBuffer *LogBuffer) GetEarliestTime() time.Time {
|
|
return logBuffer.startTime
|
|
}
|
|
func (logBuffer *LogBuffer) GetEarliestPosition() MessagePosition {
|
|
return MessagePosition{
|
|
Time: logBuffer.startTime,
|
|
Offset: logBuffer.offset,
|
|
}
|
|
}
|
|
|
|
// releaseMemory resets the flushed snapshot and returns it to the shared pool.
func (d *dataToFlush) releaseMemory() {
	d.data.Reset()
	bufferPool.Put(d.data)
}
|
|
|
|
// ReadFromBuffer returns a pooled copy of in-memory data at/after lastReadPosition.
// The caller must release a non-nil bufferCopy via ReleaseMemory.
//
// Return conventions (as implemented below):
//   - (buf, offset, nil):                data served from memory
//   - (nil, offset, nil):                nothing new in memory yet; caller waits
//   - (nil, -2, ResumeFromDiskError):    data only on disk; caller reads from disk
//   - (nil, -2, nil):                    position not found in memory (possibly flushed)
func (logBuffer *LogBuffer) ReadFromBuffer(lastReadPosition MessagePosition) (bufferCopy *bytes.Buffer, batchIndex int64, err error) {
	logBuffer.RLock()
	defer logBuffer.RUnlock()

	isOffsetBased := lastReadPosition.IsOffsetBased

	// CRITICAL FIX: For offset-based subscriptions, use offset comparisons, not time comparisons!
	if isOffsetBased {
		requestedOffset := lastReadPosition.Offset

		// DEBUG: Log buffer state for _schemas topic
		// NOTE(review): these strings.Contains(..., "_schemas") debug logs run on
		// every read — consider removing once the integration is stable.
		if strings.Contains(logBuffer.name, "_schemas") {
			glog.Infof("[SCHEMAS ReadFromBuffer] requested=%d bufferStart=%d bufferEnd=%d pos=%d lastFlushed=%d",
				requestedOffset, logBuffer.bufferStartOffset, logBuffer.offset, logBuffer.pos, logBuffer.lastFlushedOffset.Load())
		}

		// Check if the requested offset is in the current buffer range
		if requestedOffset >= logBuffer.bufferStartOffset && requestedOffset <= logBuffer.offset {
			// If current buffer is empty (pos=0), check if data is on disk or not yet written
			if logBuffer.pos == 0 {
				// CRITICAL FIX: If buffer is empty but offset range covers the request,
				// it means data was in memory and has been flushed/moved out.
				// The bufferStartOffset advancing to cover this offset proves data existed.
				//
				// Three cases:
				// 1. requestedOffset < logBuffer.offset: Data was here, now flushed
				// 2. requestedOffset == logBuffer.offset && bufferStartOffset > 0: Buffer advanced, data flushed
				// 3. requestedOffset == logBuffer.offset && bufferStartOffset == 0: Initial state - try disk first!
				//
				// Cases 1 & 2: try disk read
				// Case 3: try disk read (historical data might exist)
				if requestedOffset < logBuffer.offset {
					// Data was in the buffer range but buffer is now empty = flushed to disk
					if strings.Contains(logBuffer.name, "_schemas") {
						glog.Infof("[SCHEMAS ReadFromBuffer] Returning ResumeFromDiskError: empty buffer, offset %d was flushed (bufferStart=%d, offset=%d)",
							requestedOffset, logBuffer.bufferStartOffset, logBuffer.offset)
					}
					return nil, -2, ResumeFromDiskError
				}
				// requestedOffset == logBuffer.offset: Current position
				// CRITICAL: For subscribers starting from offset 0, try disk read first
				// (historical data might exist from previous runs)
				if requestedOffset == 0 && logBuffer.bufferStartOffset == 0 && logBuffer.offset == 0 {
					// Initial state: try disk read before waiting for new data
					if strings.Contains(logBuffer.name, "_schemas") {
						glog.Infof("[SCHEMAS ReadFromBuffer] Initial state, trying disk read for offset 0")
					}
					return nil, -2, ResumeFromDiskError
				}
				// Otherwise, wait for new data to arrive
				if strings.Contains(logBuffer.name, "_schemas") {
					glog.Infof("[SCHEMAS ReadFromBuffer] Returning nil: waiting for offset %d to arrive", requestedOffset)
				}
				return nil, logBuffer.offset, nil
			}
			if strings.Contains(logBuffer.name, "_schemas") {
				glog.Infof("[SCHEMAS ReadFromBuffer] Returning %d bytes from buffer", logBuffer.pos)
			}
			return copiedBytes(logBuffer.buf[:logBuffer.pos]), logBuffer.offset, nil
		}

		// Check previous buffers for the requested offset
		for _, buf := range logBuffer.prevBuffers.buffers {
			if requestedOffset >= buf.startOffset && requestedOffset <= buf.offset {
				// If prevBuffer is empty, it means the data was flushed to disk
				// (prevBuffers are created when buffer is flushed)
				if buf.size == 0 {
					// Empty prevBuffer covering this offset means data was flushed
					return nil, -2, ResumeFromDiskError
				}
				return copiedBytes(buf.buf[:buf.size]), buf.offset, nil
			}
		}

		// Offset not found in any buffer
		if requestedOffset < logBuffer.bufferStartOffset {
			// Data not in current buffers - must be on disk (flushed or never existed)
			// Return ResumeFromDiskError to trigger disk read
			return nil, -2, ResumeFromDiskError
		}

		if requestedOffset > logBuffer.offset {
			// Future data, not available yet
			return nil, logBuffer.offset, nil
		}

		// Offset not found - return nil
		return nil, logBuffer.offset, nil
	}

	// TIMESTAMP-BASED READ (original logic)
	// Read from disk and memory
	// 1. read from disk, last time is = td
	// 2. in memory, the earliest time = tm
	// if tm <= td, case 2.1
	// read from memory
	// if tm is empty, case 2.2
	// read from memory
	// if td < tm, case 2.3
	// read from disk again
	var tsMemory time.Time
	var tsBatchIndex int64
	if !logBuffer.startTime.IsZero() {
		tsMemory = logBuffer.startTime
		tsBatchIndex = logBuffer.offset
	}
	// Find the earliest start time across the sealed buffers.
	for _, prevBuf := range logBuffer.prevBuffers.buffers {
		if !prevBuf.startTime.IsZero() && prevBuf.startTime.Before(tsMemory) {
			tsMemory = prevBuf.startTime
			tsBatchIndex = prevBuf.offset
		}
	}
	if tsMemory.IsZero() { // case 2.2
		return nil, -2, nil
	} else if lastReadPosition.Time.Before(tsMemory) && lastReadPosition.Offset+1 < tsBatchIndex { // case 2.3
		// Special case: If requested time is zero (Unix epoch), treat as "start from beginning"
		// This handles queries that want to read all data without knowing the exact start time
		if lastReadPosition.Time.IsZero() || lastReadPosition.Time.Unix() == 0 {
			// Start from the beginning of memory
			// Fall through to case 2.1 to read from earliest buffer
		} else {
			// Data not in memory buffers - read from disk
			glog.V(0).Infof("resume from disk: requested time %v < earliest memory time %v",
				lastReadPosition.Time, tsMemory)
			return nil, -2, ResumeFromDiskError
		}
	}

	// the following is case 2.1

	if lastReadPosition.Time.Equal(logBuffer.stopTime) {
		return nil, logBuffer.offset, nil
	}
	if lastReadPosition.Time.After(logBuffer.stopTime) {
		// glog.Fatalf("unexpected last read time %v, older than latest %v", lastReadPosition, m.stopTime)
		return nil, logBuffer.offset, nil
	}
	if lastReadPosition.Time.Before(logBuffer.startTime) {
		// The requested time predates the current buffer: scan sealed buffers.
		for _, buf := range logBuffer.prevBuffers.buffers {
			if buf.startTime.After(lastReadPosition.Time) {
				// glog.V(4).Infof("%s return the %d sealed buffer %v", m.name, i, buf.startTime)
				return copiedBytes(buf.buf[:buf.size]), buf.offset, nil
			}
			if !buf.startTime.After(lastReadPosition.Time) && buf.stopTime.After(lastReadPosition.Time) {
				pos := buf.locateByTs(lastReadPosition.Time)
				return copiedBytes(buf.buf[pos:buf.size]), buf.offset, nil
			}
		}
		// glog.V(4).Infof("%s return the current buf %v", m.name, lastReadPosition)
		return copiedBytes(logBuffer.buf[:logBuffer.pos]), logBuffer.offset, nil
	}

	// Binary-search idx for the first entry strictly newer than lastReadPosition.
	lastTs := lastReadPosition.Time.UnixNano()
	l, h := 0, len(logBuffer.idx)-1

	/*
		for i, pos := range m.idx {
			logEntry, ts := readTs(m.buf, pos)
			event := &filer_pb.SubscribeMetadataResponse{}
			proto.Unmarshal(logEntry.Data, event)
			entry := event.EventNotification.OldEntry
			if entry == nil {
				entry = event.EventNotification.NewEntry
			}
		}
	*/

	for l <= h {
		mid := (l + h) / 2
		pos := logBuffer.idx[mid]
		_, t := readTs(logBuffer.buf, pos)
		if t <= lastTs {
			l = mid + 1
		} else if lastTs < t {
			var prevT int64
			if mid > 0 {
				_, prevT = readTs(logBuffer.buf, logBuffer.idx[mid-1])
			}
			if prevT <= lastTs {
				return copiedBytes(logBuffer.buf[pos:logBuffer.pos]), logBuffer.offset, nil
			}
			h = mid
		}
	}

	// Binary search didn't find the timestamp - data may have been flushed to disk already
	// Returning -2 signals to caller that data is not available in memory
	return nil, -2, nil

}
|
|
func (logBuffer *LogBuffer) ReleaseMemory(b *bytes.Buffer) {
|
|
bufferPool.Put(b)
|
|
}
|
|
|
|
// GetName returns the log buffer name for metadata tracking
|
|
func (logBuffer *LogBuffer) GetName() string {
|
|
logBuffer.RLock()
|
|
defer logBuffer.RUnlock()
|
|
return logBuffer.name
|
|
}
|
|
|
|
// GetOffset returns the current offset for metadata tracking
|
|
func (logBuffer *LogBuffer) GetOffset() int64 {
|
|
logBuffer.RLock()
|
|
defer logBuffer.RUnlock()
|
|
return logBuffer.offset
|
|
}
|
|
|
|
// bufferPool recycles bytes.Buffer instances used for read copies
// (copiedBytes) and flush snapshots, avoiding per-read allocations.
var bufferPool = sync.Pool{
	New: func() interface{} {
		return new(bytes.Buffer)
	},
}
|
|
|
|
func copiedBytes(buf []byte) (copied *bytes.Buffer) {
|
|
copied = bufferPool.Get().(*bytes.Buffer)
|
|
copied.Reset()
|
|
copied.Write(buf)
|
|
return
|
|
}
|
|
|
|
func readTs(buf []byte, pos int) (size int, ts int64) {
|
|
|
|
size = int(util.BytesToUint32(buf[pos : pos+4]))
|
|
entryData := buf[pos+4 : pos+4+size]
|
|
logEntry := &filer_pb.LogEntry{}
|
|
|
|
err := proto.Unmarshal(entryData, logEntry)
|
|
if err != nil {
|
|
glog.Fatalf("unexpected unmarshal filer_pb.LogEntry: %v", err)
|
|
}
|
|
return size, logEntry.TsNs
|
|
|
|
}
|