You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							399 lines
						
					
					
						
							12 KiB
						
					
					
				
			
		
		
		
			
			
			
		
		
	
	
							399 lines
						
					
					
						
							12 KiB
						
					
					
				| package consumer | |
| 
 | |
| import ( | |
| 	"crypto/sha256" | |
| 	"fmt" | |
| 	"sync" | |
| 	"time" | |
| ) | |
| 
 | |
// GroupState identifies the state a consumer group is currently in.
type GroupState int

// Consumer group states, numbered from zero in declaration order.
const (
	GroupStateEmpty GroupState = iota
	GroupStatePreparingRebalance
	GroupStateCompletingRebalance
	GroupStateStable
	GroupStateDead
)

// String returns the display name of the state. Any value that is not one of
// the declared constants is rendered as "Unknown".
func (gs GroupState) String() string {
	names := [...]string{
		GroupStateEmpty:               "Empty",
		GroupStatePreparingRebalance:  "PreparingRebalance",
		GroupStateCompletingRebalance: "CompletingRebalance",
		GroupStateStable:              "Stable",
		GroupStateDead:                "Dead",
	}
	if gs < 0 || int(gs) >= len(names) {
		return "Unknown"
	}
	return names[gs]
}
| 
 | |
// MemberState identifies the state of a single member within a group.
type MemberState int

// Group member states, numbered from zero in declaration order.
const (
	MemberStateUnknown MemberState = iota
	MemberStatePending
	MemberStateStable
	MemberStateLeaving
)

// String returns the display name of the state. Values outside the declared
// constants share the "Unknown" name with MemberStateUnknown.
func (ms MemberState) String() string {
	names := [...]string{
		MemberStateUnknown: "Unknown",
		MemberStatePending: "Pending",
		MemberStateStable:  "Stable",
		MemberStateLeaving: "Leaving",
	}
	if ms < 0 || int(ms) >= len(names) {
		return "Unknown"
	}
	return names[ms]
}
| 
 | |
| // GroupMember represents a consumer in a consumer group | |
| type GroupMember struct { | |
| 	ID               string                // Member ID (generated by gateway) | |
| 	ClientID         string                // Client ID from consumer | |
| 	ClientHost       string                // Client host/IP | |
| 	GroupInstanceID  *string               // Static membership instance ID (optional) | |
| 	SessionTimeout   int32                 // Session timeout in milliseconds | |
| 	RebalanceTimeout int32                 // Rebalance timeout in milliseconds | |
| 	Subscription     []string              // Subscribed topics | |
| 	Assignment       []PartitionAssignment // Assigned partitions | |
| 	Metadata         []byte                // Protocol-specific metadata | |
| 	State            MemberState           // Current member state | |
| 	LastHeartbeat    time.Time             // Last heartbeat timestamp | |
| 	JoinedAt         time.Time             // When member joined group | |
| } | |
| 
 | |
// PartitionAssignment names a single topic partition assigned to a member.
type PartitionAssignment struct {
	// Topic is the topic name.
	Topic string

	// Partition is the partition number.
	Partition int32
}
| 
 | |
| // ConsumerGroup represents a Kafka consumer group | |
| type ConsumerGroup struct { | |
| 	ID               string                            // Group ID | |
| 	State            GroupState                        // Current group state | |
| 	Generation       int32                             // Generation ID (incremented on rebalance) | |
| 	Protocol         string                            // Assignment protocol (e.g., "range", "roundrobin") | |
| 	Leader           string                            // Leader member ID | |
| 	Members          map[string]*GroupMember           // Group members by member ID | |
| 	StaticMembers    map[string]string                 // Static instance ID -> member ID mapping | |
| 	SubscribedTopics map[string]bool                   // Topics subscribed by group | |
| 	OffsetCommits    map[string]map[int32]OffsetCommit // Topic -> Partition -> Offset | |
| 	CreatedAt        time.Time                         // Group creation time | |
| 	LastActivity     time.Time                         // Last activity (join, heartbeat, etc.) | |
|  | |
| 	Mu sync.RWMutex // Protects group state | |
| } | |
| 
 | |
// OffsetCommit records the offset committed for one topic partition.
type OffsetCommit struct {
	// Offset is the committed offset.
	Offset int64

	// Metadata is optional metadata attached to the commit.
	Metadata string

	// Timestamp is when the commit was made.
	Timestamp time.Time
}
| 
 | |
| // GroupCoordinator manages consumer groups | |
| type GroupCoordinator struct { | |
| 	groups   map[string]*ConsumerGroup // Group ID -> Group | |
| 	groupsMu sync.RWMutex              // Protects groups map | |
|  | |
| 	// Configuration | |
| 	sessionTimeoutMin  int32 // Minimum session timeout (ms) | |
| 	sessionTimeoutMax  int32 // Maximum session timeout (ms) | |
| 	rebalanceTimeoutMs int32 // Default rebalance timeout (ms) | |
|  | |
| 	// Timeout management | |
| 	rebalanceTimeoutManager *RebalanceTimeoutManager | |
| 
 | |
| 	// Cleanup | |
| 	cleanupTicker *time.Ticker | |
| 	stopChan      chan struct{} | |
| 	stopOnce      sync.Once | |
| } | |
| 
 | |
| // NewGroupCoordinator creates a new consumer group coordinator | |
| func NewGroupCoordinator() *GroupCoordinator { | |
| 	gc := &GroupCoordinator{ | |
| 		groups:             make(map[string]*ConsumerGroup), | |
| 		sessionTimeoutMin:  6000,   // 6 seconds | |
| 		sessionTimeoutMax:  300000, // 5 minutes | |
| 		rebalanceTimeoutMs: 300000, // 5 minutes | |
| 		stopChan:           make(chan struct{}), | |
| 	} | |
| 
 | |
| 	// Initialize rebalance timeout manager | |
| 	gc.rebalanceTimeoutManager = NewRebalanceTimeoutManager(gc) | |
| 
 | |
| 	// Start cleanup routine | |
| 	gc.cleanupTicker = time.NewTicker(30 * time.Second) | |
| 	go gc.cleanupRoutine() | |
| 
 | |
| 	return gc | |
| } | |
| 
 | |
| // GetOrCreateGroup returns an existing group or creates a new one | |
| func (gc *GroupCoordinator) GetOrCreateGroup(groupID string) *ConsumerGroup { | |
| 	gc.groupsMu.Lock() | |
| 	defer gc.groupsMu.Unlock() | |
| 
 | |
| 	group, exists := gc.groups[groupID] | |
| 	if !exists { | |
| 		group = &ConsumerGroup{ | |
| 			ID:               groupID, | |
| 			State:            GroupStateEmpty, | |
| 			Generation:       0, | |
| 			Members:          make(map[string]*GroupMember), | |
| 			StaticMembers:    make(map[string]string), | |
| 			SubscribedTopics: make(map[string]bool), | |
| 			OffsetCommits:    make(map[string]map[int32]OffsetCommit), | |
| 			CreatedAt:        time.Now(), | |
| 			LastActivity:     time.Now(), | |
| 		} | |
| 		gc.groups[groupID] = group | |
| 	} | |
| 
 | |
| 	return group | |
| } | |
| 
 | |
| // GetGroup returns an existing group or nil if not found | |
| func (gc *GroupCoordinator) GetGroup(groupID string) *ConsumerGroup { | |
| 	gc.groupsMu.RLock() | |
| 	defer gc.groupsMu.RUnlock() | |
| 
 | |
| 	return gc.groups[groupID] | |
| } | |
| 
 | |
| // RemoveGroup removes a group from the coordinator | |
| func (gc *GroupCoordinator) RemoveGroup(groupID string) { | |
| 	gc.groupsMu.Lock() | |
| 	defer gc.groupsMu.Unlock() | |
| 
 | |
| 	delete(gc.groups, groupID) | |
| } | |
| 
 | |
| // ListGroups returns all current group IDs | |
| func (gc *GroupCoordinator) ListGroups() []string { | |
| 	gc.groupsMu.RLock() | |
| 	defer gc.groupsMu.RUnlock() | |
| 
 | |
| 	groups := make([]string, 0, len(gc.groups)) | |
| 	for groupID := range gc.groups { | |
| 		groups = append(groups, groupID) | |
| 	} | |
| 	return groups | |
| } | |
| 
 | |
| // FindStaticMember finds a member by static instance ID | |
| func (gc *GroupCoordinator) FindStaticMember(group *ConsumerGroup, instanceID string) *GroupMember { | |
| 	if instanceID == "" { | |
| 		return nil | |
| 	} | |
| 
 | |
| 	group.Mu.RLock() | |
| 	defer group.Mu.RUnlock() | |
| 
 | |
| 	if memberID, exists := group.StaticMembers[instanceID]; exists { | |
| 		return group.Members[memberID] | |
| 	} | |
| 	return nil | |
| } | |
| 
 | |
| // FindStaticMemberLocked finds a member by static instance ID (assumes group is already locked) | |
| func (gc *GroupCoordinator) FindStaticMemberLocked(group *ConsumerGroup, instanceID string) *GroupMember { | |
| 	if instanceID == "" { | |
| 		return nil | |
| 	} | |
| 
 | |
| 	if memberID, exists := group.StaticMembers[instanceID]; exists { | |
| 		return group.Members[memberID] | |
| 	} | |
| 	return nil | |
| } | |
| 
 | |
| // RegisterStaticMember registers a static member in the group | |
| func (gc *GroupCoordinator) RegisterStaticMember(group *ConsumerGroup, member *GroupMember) { | |
| 	if member.GroupInstanceID == nil || *member.GroupInstanceID == "" { | |
| 		return | |
| 	} | |
| 
 | |
| 	group.Mu.Lock() | |
| 	defer group.Mu.Unlock() | |
| 
 | |
| 	group.StaticMembers[*member.GroupInstanceID] = member.ID | |
| } | |
| 
 | |
| // RegisterStaticMemberLocked registers a static member in the group (assumes group is already locked) | |
| func (gc *GroupCoordinator) RegisterStaticMemberLocked(group *ConsumerGroup, member *GroupMember) { | |
| 	if member.GroupInstanceID == nil || *member.GroupInstanceID == "" { | |
| 		return | |
| 	} | |
| 
 | |
| 	group.StaticMembers[*member.GroupInstanceID] = member.ID | |
| } | |
| 
 | |
| // UnregisterStaticMember removes a static member from the group | |
| func (gc *GroupCoordinator) UnregisterStaticMember(group *ConsumerGroup, instanceID string) { | |
| 	if instanceID == "" { | |
| 		return | |
| 	} | |
| 
 | |
| 	group.Mu.Lock() | |
| 	defer group.Mu.Unlock() | |
| 
 | |
| 	delete(group.StaticMembers, instanceID) | |
| } | |
| 
 | |
| // UnregisterStaticMemberLocked removes a static member from the group (assumes group is already locked) | |
| func (gc *GroupCoordinator) UnregisterStaticMemberLocked(group *ConsumerGroup, instanceID string) { | |
| 	if instanceID == "" { | |
| 		return | |
| 	} | |
| 
 | |
| 	delete(group.StaticMembers, instanceID) | |
| } | |
| 
 | |
| // IsStaticMember checks if a member is using static membership | |
| func (gc *GroupCoordinator) IsStaticMember(member *GroupMember) bool { | |
| 	return member.GroupInstanceID != nil && *member.GroupInstanceID != "" | |
| } | |
| 
 | |
| // GenerateMemberID creates a deterministic member ID based on client info | |
| func (gc *GroupCoordinator) GenerateMemberID(clientID, clientHost string) string { | |
| 	// EXPERIMENT: Use simpler member ID format like real Kafka brokers | |
| 	// Real Kafka uses format like: "consumer-1-uuid" or "consumer-groupId-uuid" | |
| 	hash := fmt.Sprintf("%x", sha256.Sum256([]byte(clientID+"-"+clientHost))) | |
| 	return fmt.Sprintf("consumer-%s", hash[:16]) // Shorter, simpler format | |
| } | |
| 
 | |
| // ValidateSessionTimeout checks if session timeout is within acceptable range | |
| func (gc *GroupCoordinator) ValidateSessionTimeout(timeout int32) bool { | |
| 	return timeout >= gc.sessionTimeoutMin && timeout <= gc.sessionTimeoutMax | |
| } | |
| 
 | |
| // cleanupRoutine periodically cleans up dead groups and expired members | |
| func (gc *GroupCoordinator) cleanupRoutine() { | |
| 	for { | |
| 		select { | |
| 		case <-gc.cleanupTicker.C: | |
| 			gc.performCleanup() | |
| 		case <-gc.stopChan: | |
| 			return | |
| 		} | |
| 	} | |
| } | |
| 
 | |
| // performCleanup removes expired members and empty groups | |
| func (gc *GroupCoordinator) performCleanup() { | |
| 	now := time.Now() | |
| 
 | |
| 	// Use rebalance timeout manager for more sophisticated timeout handling | |
| 	gc.rebalanceTimeoutManager.CheckRebalanceTimeouts() | |
| 
 | |
| 	gc.groupsMu.Lock() | |
| 	defer gc.groupsMu.Unlock() | |
| 
 | |
| 	for groupID, group := range gc.groups { | |
| 		group.Mu.Lock() | |
| 
 | |
| 		// Check for expired members (session timeout) | |
| 		expiredMembers := make([]string, 0) | |
| 		for memberID, member := range group.Members { | |
| 			sessionDuration := time.Duration(member.SessionTimeout) * time.Millisecond | |
| 			timeSinceHeartbeat := now.Sub(member.LastHeartbeat) | |
| 			if timeSinceHeartbeat > sessionDuration { | |
| 				expiredMembers = append(expiredMembers, memberID) | |
| 			} | |
| 		} | |
| 
 | |
| 		// Remove expired members | |
| 		for _, memberID := range expiredMembers { | |
| 			delete(group.Members, memberID) | |
| 			if group.Leader == memberID { | |
| 				group.Leader = "" | |
| 			} | |
| 		} | |
| 
 | |
| 		// Update group state based on member count | |
| 		if len(group.Members) == 0 { | |
| 			if group.State != GroupStateEmpty { | |
| 				group.State = GroupStateEmpty | |
| 				group.Generation++ | |
| 			} | |
| 
 | |
| 			// Mark group for deletion if empty for too long (30 minutes) | |
| 			if now.Sub(group.LastActivity) > 30*time.Minute { | |
| 				group.State = GroupStateDead | |
| 			} | |
| 		} | |
| 
 | |
| 		// Check for stuck rebalances and force completion if necessary | |
| 		maxRebalanceDuration := 10 * time.Minute // Maximum time allowed for rebalancing | |
| 		if gc.rebalanceTimeoutManager.IsRebalanceStuck(group, maxRebalanceDuration) { | |
| 			gc.rebalanceTimeoutManager.ForceCompleteRebalance(group) | |
| 		} | |
| 
 | |
| 		group.Mu.Unlock() | |
| 
 | |
| 		// Remove dead groups | |
| 		if group.State == GroupStateDead { | |
| 			delete(gc.groups, groupID) | |
| 		} | |
| 	} | |
| } | |
| 
 | |
| // Close shuts down the group coordinator | |
| func (gc *GroupCoordinator) Close() { | |
| 	gc.stopOnce.Do(func() { | |
| 		close(gc.stopChan) | |
| 		if gc.cleanupTicker != nil { | |
| 			gc.cleanupTicker.Stop() | |
| 		} | |
| 	}) | |
| } | |
| 
 | |
| // GetGroupStats returns statistics about the group coordinator | |
| func (gc *GroupCoordinator) GetGroupStats() map[string]interface{} { | |
| 	gc.groupsMu.RLock() | |
| 	defer gc.groupsMu.RUnlock() | |
| 
 | |
| 	stats := map[string]interface{}{ | |
| 		"total_groups": len(gc.groups), | |
| 		"group_states": make(map[string]int), | |
| 	} | |
| 
 | |
| 	stateCount := make(map[GroupState]int) | |
| 	totalMembers := 0 | |
| 
 | |
| 	for _, group := range gc.groups { | |
| 		group.Mu.RLock() | |
| 		stateCount[group.State]++ | |
| 		totalMembers += len(group.Members) | |
| 		group.Mu.RUnlock() | |
| 	} | |
| 
 | |
| 	stats["total_members"] = totalMembers | |
| 	for state, count := range stateCount { | |
| 		stats["group_states"].(map[string]int)[state.String()] = count | |
| 	} | |
| 
 | |
| 	return stats | |
| } | |
| 
 | |
| // GetRebalanceStatus returns the rebalance status for a specific group | |
| func (gc *GroupCoordinator) GetRebalanceStatus(groupID string) *RebalanceStatus { | |
| 	return gc.rebalanceTimeoutManager.GetRebalanceStatus(groupID) | |
| }
 |