package offset

import (
	"context"
	"encoding/json"
	"fmt"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/filer"
	"github.com/seaweedfs/seaweedfs/weed/filer_client"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

// PersistentLedger extends Ledger with persistence capabilities
type PersistentLedger struct {
	*Ledger
	topicPartition string
	storage        LedgerStorage
}

// ConsumerOffsetKey represents the full key for consumer offset storage
type ConsumerOffsetKey struct {
	Topic                 string
	Partition             int32
	ConsumerGroup         string
	ConsumerGroupInstance string // optional - can be empty
}

// String returns the string representation for use as a map key
func (k ConsumerOffsetKey) String() string {
	if k.ConsumerGroupInstance != "" {
		return fmt.Sprintf("%s:%d:%s:%s", k.Topic, k.Partition, k.ConsumerGroup, k.ConsumerGroupInstance)
	}
	return fmt.Sprintf("%s:%d:%s", k.Topic, k.Partition, k.ConsumerGroup)
}
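
// For illustration (hypothetical values), the two key shapes look like:
//
//	ConsumerOffsetKey{Topic: "events", Partition: 3, ConsumerGroup: "group-a"}.String()
//	// "events:3:group-a"
//
//	ConsumerOffsetKey{Topic: "events", Partition: 3, ConsumerGroup: "group-a",
//		ConsumerGroupInstance: "inst-1"}.String()
//	// "events:3:group-a:inst-1"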

// LedgerStorage is the interface for persisting consumer group offset mappings
type LedgerStorage interface {
	// SaveConsumerOffset persists a consumer's committed Kafka offset -> SMQ timestamp mapping
	SaveConsumerOffset(key ConsumerOffsetKey, kafkaOffset, smqTimestamp int64, size int32) error

	// LoadConsumerOffsets restores all offset mappings for a consumer group's topic-partition
	LoadConsumerOffsets(key ConsumerOffsetKey) ([]OffsetEntry, error)

	// GetConsumerHighWaterMark returns the next Kafka offset after a consumer's
	// highest committed offset (0 if nothing has been committed)
	GetConsumerHighWaterMark(key ConsumerOffsetKey) (int64, error)

	// Legacy methods for backward compatibility (deprecated)
	SaveOffsetMapping(topicPartition string, kafkaOffset, smqTimestamp int64, size int32) error
	LoadOffsetMappings(topicPartition string) ([]OffsetEntry, error)
	GetHighWaterMark(topicPartition string) (int64, error)
}
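
// A minimal usage sketch (assuming some LedgerStorage implementation in
// "storage", with hypothetical offsets and sizes):
//
//	key := ConsumerOffsetKey{Topic: "events", Partition: 0, ConsumerGroup: "group-a"}
//	if err := storage.SaveConsumerOffset(key, 42, time.Now().UnixNano(), 256); err != nil {
//		// handle persistence failure
//	}
//	next, _ := storage.GetConsumerHighWaterMark(key) // 43: next offset to consume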

// NewPersistentLedger creates a ledger that persists to storage
func NewPersistentLedger(topicPartition string, storage LedgerStorage) (*PersistentLedger, error) {
	// Try to restore from storage (legacy method for backward compatibility)
	entries, err := storage.LoadOffsetMappings(topicPartition)
	if err != nil {
		return nil, fmt.Errorf("failed to load offset mappings: %w", err)
	}

	// Determine the next offset: one past the highest restored offset
	var nextOffset int64
	if len(entries) > 0 {
		// Sort entries by offset to find the highest
		sort.Slice(entries, func(i, j int) bool {
			return entries[i].KafkaOffset < entries[j].KafkaOffset
		})
		nextOffset = entries[len(entries)-1].KafkaOffset + 1
	}

	// Create the base ledger with the restored state
	ledger := &Ledger{
		entries:    entries,
		nextOffset: nextOffset,
	}

	// Update earliest/latest timestamps from the sorted entries (assumes
	// timestamps increase with offsets, as appended records do)
	if len(entries) > 0 {
		ledger.earliestTime = entries[0].Timestamp
		ledger.latestTime = entries[len(entries)-1].Timestamp
	}

	return &PersistentLedger{
		Ledger:         ledger,
		topicPartition: topicPartition,
		storage:        storage,
	}, nil
}
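
// Illustrative construction (hypothetical names; "storage" is any LedgerStorage):
//
//	ledger, err := NewPersistentLedger("events:0", storage)
//	if err != nil {
//		// storage unreachable or corrupt mappings
//	}
//	_ = ledger.AppendRecord(0, time.Now().UnixNano(), 512) // persists first, then updates memory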

// AppendRecord persists the offset mapping in addition to in-memory storage
func (pl *PersistentLedger) AppendRecord(kafkaOffset, timestamp int64, size int32) error {
	// First persist to storage (legacy method for backward compatibility)
	if err := pl.storage.SaveOffsetMapping(pl.topicPartition, kafkaOffset, timestamp, size); err != nil {
		return fmt.Errorf("failed to persist offset mapping: %w", err)
	}

	// Then update the in-memory ledger
	return pl.Ledger.AppendRecord(kafkaOffset, timestamp, size)
}

// GetEntries returns the offset entries from the underlying ledger
func (pl *PersistentLedger) GetEntries() []OffsetEntry {
	return pl.Ledger.GetEntries()
}

// SMQIntegratedStorage implements LedgerStorage using SMQ's in-memory replication pattern.
// This approach avoids the scalability issue by restoring from checkpoints instead of
// reading the full offset history.
type SMQIntegratedStorage struct {
	filerAddress        string
	filerClientAccessor *filer_client.FilerClientAccessor

	// In-memory replicated state (SMQ pattern)
	ledgers sync.Map // map[string]*ReplicatedOffsetLedger, keyed by ConsumerOffsetKey.String()

	// Configuration
	checkpointInterval time.Duration
	maxMemoryMappings  int
	ctx                context.Context
	cancel             context.CancelFunc
}

// ReplicatedOffsetLedger represents in-memory consumer offset state with checkpoint persistence
type ReplicatedOffsetLedger struct {
	consumerKey ConsumerOffsetKey

	// In-memory mappings (recent entries only)
	mappings      sync.Map // map[int64]*OffsetEntry, keyed by Kafka offset
	currentOffset int64
	maxOffset     int64

	// Checkpoint state
	lastCheckpoint     int64
	lastCheckpointTime time.Time
	lastPersistTime    time.Time

	// State management
	mu               sync.RWMutex
	needsPersistence bool
}

// NewSMQIntegratedStorage creates SMQ-integrated offset storage.
// It uses SMQ's proven in-memory replication + checkpoint persistence pattern.
func NewSMQIntegratedStorage(brokers []string) (*SMQIntegratedStorage, error) {
	if len(brokers) == 0 {
		return nil, fmt.Errorf("no brokers provided")
	}

	ctx, cancel := context.WithCancel(context.Background())

	// Use the first broker as the filer address (brokers typically run co-located
	// with a filer; in the SMQ architecture, brokers connect to local filer instances)
	filerAddress := brokers[0]

	// Create a filer client accessor (as SMQ does)
	filerClientAccessor := &filer_client.FilerClientAccessor{
		GetFiler: func() pb.ServerAddress {
			return pb.ServerAddress(filerAddress)
		},
		GetGrpcDialOption: func() grpc.DialOption {
			return grpc.WithTransportCredentials(insecure.NewCredentials())
		},
	}

	storage := &SMQIntegratedStorage{
		filerAddress:        filerAddress,
		filerClientAccessor: filerClientAccessor,
		checkpointInterval:  30 * time.Second, // SMQ-style periodic checkpoints
		maxMemoryMappings:   10000,            // keep recent mappings in memory
		ctx:                 ctx,
		cancel:              cancel,
	}

	// Start background checkpoint persistence (SMQ pattern)
	go storage.backgroundCheckpointPersistence()

	return storage, nil
}
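
// Illustrative setup and teardown (hypothetical broker address):
//
//	storage, err := NewSMQIntegratedStorage([]string{"localhost:8888"})
//	if err != nil {
//		// handle error (e.g., no brokers provided)
//	}
//	defer storage.Close() // flushes pending checkpoints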

// SaveConsumerOffset stores the consumer offset mapping in memory (SMQ pattern)
// and triggers a checkpoint if a persistence threshold has been reached
func (s *SMQIntegratedStorage) SaveConsumerOffset(key ConsumerOffsetKey, kafkaOffset, smqTimestamp int64, size int32) error {
	// Get or create the replicated ledger for this consumer
	ledger := s.getOrCreateLedger(key)

	// Update in-memory state (like SMQ subscriber offsets)
	entry := &OffsetEntry{
		KafkaOffset: kafkaOffset,
		Timestamp:   smqTimestamp,
		Size:        size,
	}

	ledger.mu.Lock()
	ledger.mappings.Store(kafkaOffset, entry)
	ledger.currentOffset = kafkaOffset
	if kafkaOffset > ledger.maxOffset {
		ledger.maxOffset = kafkaOffset
	}
	ledger.needsPersistence = true
	ledger.mu.Unlock()

	// Trigger a checkpoint if a threshold has been reached (SMQ pattern)
	if s.shouldCheckpoint(ledger) {
		return s.persistCheckpoint(ledger)
	}

	return nil
}

// LoadConsumerOffsets loads the checkpoint plus in-memory state (SMQ pattern) --
// O(recent entries) rather than O(full history)
func (s *SMQIntegratedStorage) LoadConsumerOffsets(key ConsumerOffsetKey) ([]OffsetEntry, error) {
	ledger := s.getOrCreateLedger(key)

	// Load from checkpoint if not already loaded (SMQ pattern)
	if err := s.loadCheckpointIfNeeded(ledger); err != nil {
		return nil, fmt.Errorf("failed to load checkpoint: %w", err)
	}

	// Return the current in-memory state (fast)
	return s.getCurrentMappings(ledger), nil
}

// GetConsumerHighWaterMark returns the consumer's next offset (highest committed
// offset + 1) from in-memory state, or 0 if nothing has been committed
func (s *SMQIntegratedStorage) GetConsumerHighWaterMark(key ConsumerOffsetKey) (int64, error) {
	ledger := s.getOrCreateLedger(key)

	// Load the checkpoint if needed
	if err := s.loadCheckpointIfNeeded(ledger); err != nil {
		return 0, fmt.Errorf("failed to load checkpoint: %w", err)
	}

	ledger.mu.RLock()
	maxOffset := ledger.maxOffset
	ledger.mu.RUnlock()

	if maxOffset < 0 {
		return 0, nil
	}
	return maxOffset + 1, nil
}
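
// Sketch of the high-water-mark semantics (hypothetical key and offsets):
//
//	key := ConsumerOffsetKey{Topic: "events", Partition: 0, ConsumerGroup: "g"}
//	_ = s.SaveConsumerOffset(key, 41, time.Now().UnixNano(), 100)
//	hwm, _ := s.GetConsumerHighWaterMark(key) // 42: maxOffset(41) + 1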

// Close persists all pending checkpoints and shuts down (SMQ pattern)
func (s *SMQIntegratedStorage) Close() error {
	s.cancel()

	// Persist all dirty ledgers before shutdown (like SMQ on disconnect);
	// read the dirty flag under the lock to avoid a data race
	s.ledgers.Range(func(key, value interface{}) bool {
		ledger := value.(*ReplicatedOffsetLedger)
		ledger.mu.RLock()
		dirty := ledger.needsPersistence
		ledger.mu.RUnlock()
		if dirty {
			if err := s.persistCheckpoint(ledger); err != nil {
				glog.Errorf("failed to persist checkpoint for %s on close: %v", ledger.consumerKey.String(), err)
			}
		}
		return true
	})

	return nil
}

// SMQ-style helper methods for in-memory replication + checkpoint persistence

// getOrCreateLedger gets or creates the in-memory consumer ledger (SMQ pattern)
func (s *SMQIntegratedStorage) getOrCreateLedger(key ConsumerOffsetKey) *ReplicatedOffsetLedger {
	keyStr := key.String()
	if existing, ok := s.ledgers.Load(keyStr); ok {
		return existing.(*ReplicatedOffsetLedger)
	}

	// Create a new consumer ledger
	ledger := &ReplicatedOffsetLedger{
		consumerKey:      key,
		currentOffset:    -1,
		maxOffset:        -1,
		lastCheckpoint:   -1,
		needsPersistence: false,
	}

	// Try to store; return the existing ledger if another goroutine created it first
	if actual, loaded := s.ledgers.LoadOrStore(keyStr, ledger); loaded {
		return actual.(*ReplicatedOffsetLedger)
	}

	return ledger
}

// loadCheckpointIfNeeded loads the checkpoint from the filer if not already loaded (SMQ pattern)
func (s *SMQIntegratedStorage) loadCheckpointIfNeeded(ledger *ReplicatedOffsetLedger) error {
	ledger.mu.Lock()
	defer ledger.mu.Unlock()

	// Already loaded?
	if ledger.lastCheckpoint >= 0 {
		return nil
	}

	// Load the checkpoint from the filer
	checkpointDir := s.getCheckpointDir()
	checkpointFile := ledger.consumerKey.String() + ".json"

	err := s.filerClientAccessor.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
		data, err := filer.ReadInsideFiler(client, checkpointDir, checkpointFile)
		if err != nil {
			return err // handled below; not-found is expected for new consumers
		}

		var checkpoint CheckpointData
		if err := json.Unmarshal(data, &checkpoint); err != nil {
			return fmt.Errorf("failed to unmarshal checkpoint: %w", err)
		}

		// Restore state from the checkpoint
		ledger.lastCheckpoint = checkpoint.MaxOffset
		ledger.maxOffset = checkpoint.MaxOffset
		ledger.currentOffset = checkpoint.MaxOffset
		ledger.lastCheckpointTime = time.Unix(0, checkpoint.TimestampNs)

		// Load recent mappings for fast access; copy each element so every
		// stored pointer is distinct (taking &entry of the range variable
		// would alias a single loop variable)
		for i := range checkpoint.RecentMappings {
			entry := checkpoint.RecentMappings[i]
			ledger.mappings.Store(entry.KafkaOffset, &entry)
		}

		return nil
	})

	if err != nil && err != filer_pb.ErrNotFound {
		return fmt.Errorf("failed to load checkpoint: %w", err)
	}

	// Mark as loaded even if no checkpoint was found
	if ledger.lastCheckpoint < 0 {
		ledger.lastCheckpoint = 0
	}

	return nil
}

// getCurrentMappings returns the current in-memory mappings, sorted by Kafka offset (SMQ pattern)
func (s *SMQIntegratedStorage) getCurrentMappings(ledger *ReplicatedOffsetLedger) []OffsetEntry {
	var entries []OffsetEntry

	ledger.mappings.Range(func(key, value interface{}) bool {
		entry := value.(*OffsetEntry)
		entries = append(entries, *entry)
		return true
	})

	// Sort by Kafka offset
	sort.Slice(entries, func(i, j int) bool {
		return entries[i].KafkaOffset < entries[j].KafkaOffset
	})

	return entries
}

// shouldCheckpoint determines whether checkpoint persistence is needed (SMQ pattern).
// A checkpoint is due when enough time has passed, too many entries are held
// in memory, or the offset has advanced significantly since the last checkpoint.
func (s *SMQIntegratedStorage) shouldCheckpoint(ledger *ReplicatedOffsetLedger) bool {
	ledger.mu.RLock()
	defer ledger.mu.RUnlock()

	timeSinceLastCheckpoint := time.Since(ledger.lastCheckpointTime)

	mappingCount := 0
	ledger.mappings.Range(func(key, value interface{}) bool {
		mappingCount++
		return mappingCount < s.maxMemoryMappings // stop counting once the threshold is hit
	})

	offsetDelta := ledger.currentOffset - ledger.lastCheckpoint

	return timeSinceLastCheckpoint > s.checkpointInterval ||
		mappingCount >= s.maxMemoryMappings ||
		offsetDelta >= 1000 // significant advancement
}
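
// For instance (hypothetical state): with checkpointInterval = 30s and
// maxMemoryMappings = 10000, a ledger whose last checkpoint was 31s ago, or
// that holds 10000+ mappings, or whose currentOffset >= lastCheckpoint + 1000,
// is checkpointed on the next save or background tick.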

// persistCheckpoint saves a checkpoint to the filer (SMQ pattern)
func (s *SMQIntegratedStorage) persistCheckpoint(ledger *ReplicatedOffsetLedger) error {
	ledger.mu.Lock()
	defer ledger.mu.Unlock()

	// Collect all in-memory mappings; sync.Map iteration order is unspecified,
	// so we must gather everything before selecting the most recent entries
	var recentMappings []OffsetEntry
	ledger.mappings.Range(func(key, value interface{}) bool {
		entry := value.(*OffsetEntry)
		recentMappings = append(recentMappings, *entry)
		return true
	})

	// Sort by offset, descending, and keep the 1000 most recent entries
	sort.Slice(recentMappings, func(i, j int) bool {
		return recentMappings[i].KafkaOffset > recentMappings[j].KafkaOffset
	})
	if len(recentMappings) > 1000 {
		recentMappings = recentMappings[:1000]
	}

	// Create the checkpoint
	checkpoint := CheckpointData{
		ConsumerKey:    ledger.consumerKey,
		MaxOffset:      ledger.maxOffset,
		TimestampNs:    time.Now().UnixNano(),
		RecentMappings: recentMappings,
		TopicPartition: fmt.Sprintf("%s:%d", ledger.consumerKey.Topic, ledger.consumerKey.Partition), // legacy compatibility
	}

	// Marshal the checkpoint
	data, err := json.Marshal(checkpoint)
	if err != nil {
		return fmt.Errorf("failed to marshal checkpoint: %w", err)
	}

	// Write to the filer
	checkpointDir := s.getCheckpointDir()
	checkpointFile := ledger.consumerKey.String() + ".json"
	err = s.filerClientAccessor.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
		return filer.SaveInsideFiler(client, checkpointDir, checkpointFile, data)
	})
	if err != nil {
		return fmt.Errorf("failed to save checkpoint: %w", err)
	}

	// Update checkpoint state
	now := time.Now()
	ledger.lastCheckpoint = ledger.maxOffset
	ledger.lastCheckpointTime = now
	ledger.lastPersistTime = now
	ledger.needsPersistence = false

	return nil
}

// backgroundCheckpointPersistence runs periodic checkpoint saves (SMQ pattern)
func (s *SMQIntegratedStorage) backgroundCheckpointPersistence() {
	ticker := time.NewTicker(s.checkpointInterval)
	defer ticker.Stop()

	for {
		select {
		case <-s.ctx.Done():
			return
		case <-ticker.C:
			// Persist all ledgers that need it; read the dirty flag under
			// the lock to avoid a data race with SaveConsumerOffset
			s.ledgers.Range(func(key, value interface{}) bool {
				ledger := value.(*ReplicatedOffsetLedger)
				ledger.mu.RLock()
				dirty := ledger.needsPersistence
				ledger.mu.RUnlock()
				if dirty && s.shouldCheckpoint(ledger) {
					if err := s.persistCheckpoint(ledger); err != nil {
						// Log the error but keep persisting the remaining ledgers
						glog.Errorf("failed to persist checkpoint for %s: %v", ledger.consumerKey.String(), err)
					}
				}
				return true
			})
		}
	}
}

// getCheckpointDir returns the filer directory for checkpoints
func (s *SMQIntegratedStorage) getCheckpointDir() string {
	return "/kafka-offsets/checkpoints"
}

// CheckpointData represents persisted consumer checkpoint state
type CheckpointData struct {
	ConsumerKey    ConsumerOffsetKey `json:"consumer_key"`
	MaxOffset      int64             `json:"max_offset"`
	TimestampNs    int64             `json:"timestamp_ns"`
	RecentMappings []OffsetEntry     `json:"recent_mappings"`

	// Legacy field for backward compatibility
	TopicPartition string `json:"topic_partition,omitempty"`
}
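
// An illustrative checkpoint document as stored in the filer (hypothetical
// values; the shape of each recent_mappings element depends on OffsetEntry's
// JSON tags, which are defined elsewhere in this package):
//
//	{
//	  "consumer_key": {"Topic": "events", "Partition": 0, "ConsumerGroup": "group-a", "ConsumerGroupInstance": ""},
//	  "max_offset": 41,
//	  "timestamp_ns": 1700000000000000000,
//	  "recent_mappings": [ /* up to 1000 most recent OffsetEntry records */ ],
//	  "topic_partition": "events:0"
//	}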

// Legacy methods for backward compatibility (will be deprecated)

// legacyConsumerKey parses a "topic:partition" string into a ConsumerOffsetKey
// under the reserved "_legacy_" consumer group (no consumer group information
// is available through the legacy API)
func legacyConsumerKey(topicPartition string) (ConsumerOffsetKey, error) {
	parts := strings.Split(topicPartition, ":")
	if len(parts) != 2 {
		return ConsumerOffsetKey{}, fmt.Errorf("invalid topic-partition format: %s", topicPartition)
	}

	partition, err := strconv.ParseInt(parts[1], 10, 32)
	if err != nil {
		return ConsumerOffsetKey{}, fmt.Errorf("invalid partition number in %s: %w", topicPartition, err)
	}

	return ConsumerOffsetKey{
		Topic:         parts[0],
		Partition:     int32(partition),
		ConsumerGroup: "_legacy_",
	}, nil
}

// SaveOffsetMapping is the legacy save method keyed by topic-partition only (no consumer group info)
func (s *SMQIntegratedStorage) SaveOffsetMapping(topicPartition string, kafkaOffset, smqTimestamp int64, size int32) error {
	key, err := legacyConsumerKey(topicPartition)
	if err != nil {
		return err
	}
	return s.SaveConsumerOffset(key, kafkaOffset, smqTimestamp, size)
}

// LoadOffsetMappings is the legacy load method keyed by topic-partition only
func (s *SMQIntegratedStorage) LoadOffsetMappings(topicPartition string) ([]OffsetEntry, error) {
	key, err := legacyConsumerKey(topicPartition)
	if err != nil {
		return nil, err
	}
	return s.LoadConsumerOffsets(key)
}

// GetHighWaterMark is the legacy high-water-mark method keyed by topic-partition only
func (s *SMQIntegratedStorage) GetHighWaterMark(topicPartition string) (int64, error) {
	key, err := legacyConsumerKey(topicPartition)
	if err != nil {
		return 0, err
	}
	return s.GetConsumerHighWaterMark(key)
}