You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
399 lines
12 KiB
399 lines
12 KiB
package consumer
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"fmt"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// GroupState enumerates the lifecycle states of a consumer group.
type GroupState int

// Consumer group states, numbered from zero in declaration order.
const (
	GroupStateEmpty GroupState = iota
	GroupStatePreparingRebalance
	GroupStateCompletingRebalance
	GroupStateStable
	GroupStateDead
)

// String returns the display name of the state, or "Unknown" for any value
// outside the declared constants.
func (gs GroupState) String() string {
	names := [...]string{
		GroupStateEmpty:               "Empty",
		GroupStatePreparingRebalance:  "PreparingRebalance",
		GroupStateCompletingRebalance: "CompletingRebalance",
		GroupStateStable:              "Stable",
		GroupStateDead:                "Dead",
	}
	if gs < 0 || int(gs) >= len(names) {
		return "Unknown"
	}
	return names[gs]
}
|
|
|
|
// MemberState enumerates the states a group member can be in.
type MemberState int

// Member states, numbered from zero in declaration order.
const (
	MemberStateUnknown MemberState = iota
	MemberStatePending
	MemberStateStable
	MemberStateLeaving
)

// String returns the display name of the state. Values outside the declared
// constants render as "Unknown", the same text used for MemberStateUnknown.
func (ms MemberState) String() string {
	names := [...]string{
		MemberStateUnknown: "Unknown",
		MemberStatePending: "Pending",
		MemberStateStable:  "Stable",
		MemberStateLeaving: "Leaving",
	}
	if ms < 0 || int(ms) >= len(names) {
		return "Unknown"
	}
	return names[ms]
}
|
|
|
|
// GroupMember represents a consumer in a consumer group
|
|
type GroupMember struct {
|
|
ID string // Member ID (generated by gateway)
|
|
ClientID string // Client ID from consumer
|
|
ClientHost string // Client host/IP
|
|
GroupInstanceID *string // Static membership instance ID (optional)
|
|
SessionTimeout int32 // Session timeout in milliseconds
|
|
RebalanceTimeout int32 // Rebalance timeout in milliseconds
|
|
Subscription []string // Subscribed topics
|
|
Assignment []PartitionAssignment // Assigned partitions
|
|
Metadata []byte // Protocol-specific metadata
|
|
State MemberState // Current member state
|
|
LastHeartbeat time.Time // Last heartbeat timestamp
|
|
JoinedAt time.Time // When member joined group
|
|
}
|
|
|
|
// PartitionAssignment identifies a single topic partition assigned to a
// group member.
type PartitionAssignment struct {
	// Topic is the topic name.
	Topic string

	// Partition is the partition number within Topic.
	Partition int32
}
|
|
|
|
// ConsumerGroup represents a Kafka consumer group
|
|
type ConsumerGroup struct {
|
|
ID string // Group ID
|
|
State GroupState // Current group state
|
|
Generation int32 // Generation ID (incremented on rebalance)
|
|
Protocol string // Assignment protocol (e.g., "range", "roundrobin")
|
|
Leader string // Leader member ID
|
|
Members map[string]*GroupMember // Group members by member ID
|
|
StaticMembers map[string]string // Static instance ID -> member ID mapping
|
|
SubscribedTopics map[string]bool // Topics subscribed by group
|
|
OffsetCommits map[string]map[int32]OffsetCommit // Topic -> Partition -> Offset
|
|
CreatedAt time.Time // Group creation time
|
|
LastActivity time.Time // Last activity (join, heartbeat, etc.)
|
|
|
|
Mu sync.RWMutex // Protects group state
|
|
}
|
|
|
|
// OffsetCommit records a committed offset for one topic partition.
type OffsetCommit struct {
	// Offset is the committed offset.
	Offset int64

	// Metadata is optional metadata stored with the commit.
	Metadata string

	// Timestamp is the time of the commit.
	Timestamp time.Time
}
|
|
|
|
// GroupCoordinator manages consumer groups
|
|
type GroupCoordinator struct {
|
|
groups map[string]*ConsumerGroup // Group ID -> Group
|
|
groupsMu sync.RWMutex // Protects groups map
|
|
|
|
// Configuration
|
|
sessionTimeoutMin int32 // Minimum session timeout (ms)
|
|
sessionTimeoutMax int32 // Maximum session timeout (ms)
|
|
rebalanceTimeoutMs int32 // Default rebalance timeout (ms)
|
|
|
|
// Timeout management
|
|
rebalanceTimeoutManager *RebalanceTimeoutManager
|
|
|
|
// Cleanup
|
|
cleanupTicker *time.Ticker
|
|
stopChan chan struct{}
|
|
stopOnce sync.Once
|
|
}
|
|
|
|
// NewGroupCoordinator creates a new consumer group coordinator
|
|
func NewGroupCoordinator() *GroupCoordinator {
|
|
gc := &GroupCoordinator{
|
|
groups: make(map[string]*ConsumerGroup),
|
|
sessionTimeoutMin: 6000, // 6 seconds
|
|
sessionTimeoutMax: 300000, // 5 minutes
|
|
rebalanceTimeoutMs: 300000, // 5 minutes
|
|
stopChan: make(chan struct{}),
|
|
}
|
|
|
|
// Initialize rebalance timeout manager
|
|
gc.rebalanceTimeoutManager = NewRebalanceTimeoutManager(gc)
|
|
|
|
// Start cleanup routine
|
|
gc.cleanupTicker = time.NewTicker(30 * time.Second)
|
|
go gc.cleanupRoutine()
|
|
|
|
return gc
|
|
}
|
|
|
|
// GetOrCreateGroup returns an existing group or creates a new one
|
|
func (gc *GroupCoordinator) GetOrCreateGroup(groupID string) *ConsumerGroup {
|
|
gc.groupsMu.Lock()
|
|
defer gc.groupsMu.Unlock()
|
|
|
|
group, exists := gc.groups[groupID]
|
|
if !exists {
|
|
group = &ConsumerGroup{
|
|
ID: groupID,
|
|
State: GroupStateEmpty,
|
|
Generation: 0,
|
|
Members: make(map[string]*GroupMember),
|
|
StaticMembers: make(map[string]string),
|
|
SubscribedTopics: make(map[string]bool),
|
|
OffsetCommits: make(map[string]map[int32]OffsetCommit),
|
|
CreatedAt: time.Now(),
|
|
LastActivity: time.Now(),
|
|
}
|
|
gc.groups[groupID] = group
|
|
}
|
|
|
|
return group
|
|
}
|
|
|
|
// GetGroup returns an existing group or nil if not found
|
|
func (gc *GroupCoordinator) GetGroup(groupID string) *ConsumerGroup {
|
|
gc.groupsMu.RLock()
|
|
defer gc.groupsMu.RUnlock()
|
|
|
|
return gc.groups[groupID]
|
|
}
|
|
|
|
// RemoveGroup removes a group from the coordinator
|
|
func (gc *GroupCoordinator) RemoveGroup(groupID string) {
|
|
gc.groupsMu.Lock()
|
|
defer gc.groupsMu.Unlock()
|
|
|
|
delete(gc.groups, groupID)
|
|
}
|
|
|
|
// ListGroups returns all current group IDs
|
|
func (gc *GroupCoordinator) ListGroups() []string {
|
|
gc.groupsMu.RLock()
|
|
defer gc.groupsMu.RUnlock()
|
|
|
|
groups := make([]string, 0, len(gc.groups))
|
|
for groupID := range gc.groups {
|
|
groups = append(groups, groupID)
|
|
}
|
|
return groups
|
|
}
|
|
|
|
// FindStaticMember finds a member by static instance ID
|
|
func (gc *GroupCoordinator) FindStaticMember(group *ConsumerGroup, instanceID string) *GroupMember {
|
|
if instanceID == "" {
|
|
return nil
|
|
}
|
|
|
|
group.Mu.RLock()
|
|
defer group.Mu.RUnlock()
|
|
|
|
if memberID, exists := group.StaticMembers[instanceID]; exists {
|
|
return group.Members[memberID]
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// FindStaticMemberLocked finds a member by static instance ID (assumes group is already locked)
|
|
func (gc *GroupCoordinator) FindStaticMemberLocked(group *ConsumerGroup, instanceID string) *GroupMember {
|
|
if instanceID == "" {
|
|
return nil
|
|
}
|
|
|
|
if memberID, exists := group.StaticMembers[instanceID]; exists {
|
|
return group.Members[memberID]
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// RegisterStaticMember registers a static member in the group
|
|
func (gc *GroupCoordinator) RegisterStaticMember(group *ConsumerGroup, member *GroupMember) {
|
|
if member.GroupInstanceID == nil || *member.GroupInstanceID == "" {
|
|
return
|
|
}
|
|
|
|
group.Mu.Lock()
|
|
defer group.Mu.Unlock()
|
|
|
|
group.StaticMembers[*member.GroupInstanceID] = member.ID
|
|
}
|
|
|
|
// RegisterStaticMemberLocked registers a static member in the group (assumes group is already locked)
|
|
func (gc *GroupCoordinator) RegisterStaticMemberLocked(group *ConsumerGroup, member *GroupMember) {
|
|
if member.GroupInstanceID == nil || *member.GroupInstanceID == "" {
|
|
return
|
|
}
|
|
|
|
group.StaticMembers[*member.GroupInstanceID] = member.ID
|
|
}
|
|
|
|
// UnregisterStaticMember removes a static member from the group
|
|
func (gc *GroupCoordinator) UnregisterStaticMember(group *ConsumerGroup, instanceID string) {
|
|
if instanceID == "" {
|
|
return
|
|
}
|
|
|
|
group.Mu.Lock()
|
|
defer group.Mu.Unlock()
|
|
|
|
delete(group.StaticMembers, instanceID)
|
|
}
|
|
|
|
// UnregisterStaticMemberLocked removes a static member from the group (assumes group is already locked)
|
|
func (gc *GroupCoordinator) UnregisterStaticMemberLocked(group *ConsumerGroup, instanceID string) {
|
|
if instanceID == "" {
|
|
return
|
|
}
|
|
|
|
delete(group.StaticMembers, instanceID)
|
|
}
|
|
|
|
// IsStaticMember checks if a member is using static membership
|
|
func (gc *GroupCoordinator) IsStaticMember(member *GroupMember) bool {
|
|
return member.GroupInstanceID != nil && *member.GroupInstanceID != ""
|
|
}
|
|
|
|
// GenerateMemberID creates a deterministic member ID based on client info
|
|
func (gc *GroupCoordinator) GenerateMemberID(clientID, clientHost string) string {
|
|
// EXPERIMENT: Use simpler member ID format like real Kafka brokers
|
|
// Real Kafka uses format like: "consumer-1-uuid" or "consumer-groupId-uuid"
|
|
hash := fmt.Sprintf("%x", sha256.Sum256([]byte(clientID+"-"+clientHost)))
|
|
return fmt.Sprintf("consumer-%s", hash[:16]) // Shorter, simpler format
|
|
}
|
|
|
|
// ValidateSessionTimeout checks if session timeout is within acceptable range
|
|
func (gc *GroupCoordinator) ValidateSessionTimeout(timeout int32) bool {
|
|
return timeout >= gc.sessionTimeoutMin && timeout <= gc.sessionTimeoutMax
|
|
}
|
|
|
|
// cleanupRoutine periodically cleans up dead groups and expired members
|
|
func (gc *GroupCoordinator) cleanupRoutine() {
|
|
for {
|
|
select {
|
|
case <-gc.cleanupTicker.C:
|
|
gc.performCleanup()
|
|
case <-gc.stopChan:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// performCleanup removes expired members and empty groups
|
|
func (gc *GroupCoordinator) performCleanup() {
|
|
now := time.Now()
|
|
|
|
// Use rebalance timeout manager for more sophisticated timeout handling
|
|
gc.rebalanceTimeoutManager.CheckRebalanceTimeouts()
|
|
|
|
gc.groupsMu.Lock()
|
|
defer gc.groupsMu.Unlock()
|
|
|
|
for groupID, group := range gc.groups {
|
|
group.Mu.Lock()
|
|
|
|
// Check for expired members (session timeout)
|
|
expiredMembers := make([]string, 0)
|
|
for memberID, member := range group.Members {
|
|
sessionDuration := time.Duration(member.SessionTimeout) * time.Millisecond
|
|
timeSinceHeartbeat := now.Sub(member.LastHeartbeat)
|
|
if timeSinceHeartbeat > sessionDuration {
|
|
expiredMembers = append(expiredMembers, memberID)
|
|
}
|
|
}
|
|
|
|
// Remove expired members
|
|
for _, memberID := range expiredMembers {
|
|
delete(group.Members, memberID)
|
|
if group.Leader == memberID {
|
|
group.Leader = ""
|
|
}
|
|
}
|
|
|
|
// Update group state based on member count
|
|
if len(group.Members) == 0 {
|
|
if group.State != GroupStateEmpty {
|
|
group.State = GroupStateEmpty
|
|
group.Generation++
|
|
}
|
|
|
|
// Mark group for deletion if empty for too long (30 minutes)
|
|
if now.Sub(group.LastActivity) > 30*time.Minute {
|
|
group.State = GroupStateDead
|
|
}
|
|
}
|
|
|
|
// Check for stuck rebalances and force completion if necessary
|
|
maxRebalanceDuration := 10 * time.Minute // Maximum time allowed for rebalancing
|
|
if gc.rebalanceTimeoutManager.IsRebalanceStuck(group, maxRebalanceDuration) {
|
|
gc.rebalanceTimeoutManager.ForceCompleteRebalance(group)
|
|
}
|
|
|
|
group.Mu.Unlock()
|
|
|
|
// Remove dead groups
|
|
if group.State == GroupStateDead {
|
|
delete(gc.groups, groupID)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Close shuts down the group coordinator
|
|
func (gc *GroupCoordinator) Close() {
|
|
gc.stopOnce.Do(func() {
|
|
close(gc.stopChan)
|
|
if gc.cleanupTicker != nil {
|
|
gc.cleanupTicker.Stop()
|
|
}
|
|
})
|
|
}
|
|
|
|
// GetGroupStats returns statistics about the group coordinator
|
|
func (gc *GroupCoordinator) GetGroupStats() map[string]interface{} {
|
|
gc.groupsMu.RLock()
|
|
defer gc.groupsMu.RUnlock()
|
|
|
|
stats := map[string]interface{}{
|
|
"total_groups": len(gc.groups),
|
|
"group_states": make(map[string]int),
|
|
}
|
|
|
|
stateCount := make(map[GroupState]int)
|
|
totalMembers := 0
|
|
|
|
for _, group := range gc.groups {
|
|
group.Mu.RLock()
|
|
stateCount[group.State]++
|
|
totalMembers += len(group.Members)
|
|
group.Mu.RUnlock()
|
|
}
|
|
|
|
stats["total_members"] = totalMembers
|
|
for state, count := range stateCount {
|
|
stats["group_states"].(map[string]int)[state.String()] = count
|
|
}
|
|
|
|
return stats
|
|
}
|
|
|
|
// GetRebalanceStatus returns the rebalance status for a specific group
|
|
func (gc *GroupCoordinator) GetRebalanceStatus(groupID string) *RebalanceStatus {
|
|
return gc.rebalanceTimeoutManager.GetRebalanceStatus(groupID)
|
|
}
|