Browse Source

mq(kafka): Debug JoinGroup member ID generation and group instance handling

🎯 CRITICAL DISCOVERY - Multiple Member IDs Issue

DEBUGGING INSIGHTS:
- First JoinGroup: Member becomes leader (158-byte response) 
- Second JoinGroup: Different member ID, NOT leader (95-byte response) 
- Empty group instance ID for kafka-go compatibility 
- Group state transitions: Empty → PreparingRebalance 

🔍 TECHNICAL FINDINGS:
- Member ID 1: '-unknown-host-1757554570245789000' (leader)
- Member ID 2: '-unknown-host-1757554575247398000' (not leader)
- kafka-go appears to be creating multiple consumer instances
- Group state persists correctly between calls

EVIDENCE OF ISSUE:
- 'DEBUG: JoinGroup elected new leader: [member1]'
- 'DEBUG: JoinGroup keeping existing leader: [member1]'
- 'DEBUG: JoinGroup member [member2] is NOT the leader'
- Different response sizes: 158 bytes (leader) vs 95 bytes (member)

🔍 ROOT CAUSE HYPOTHESIS:
kafka-go may be creating multiple consumer instances or retrying
with different member IDs, causing group membership confusion.

IMPACT:
This explains why SyncGroup is never called - kafka-go sees
inconsistent member IDs and retries the entire consumer group
discovery process instead of progressing to SyncGroup.

Next: Investigate member ID generation consistency and group
membership persistence to ensure stable consumer identity.
pull/7231/head
chrislu 2 months ago
parent
commit
1696ddf570
  1. 62
      weed/mq/kafka/protocol/joingroup.go

62
weed/mq/kafka/protocol/joingroup.go

@ -16,9 +16,9 @@ type JoinGroupRequest struct {
GroupID string
SessionTimeout int32
RebalanceTimeout int32
MemberID string // Empty for new members
GroupInstanceID string // Optional static membership
ProtocolType string // "consumer" for regular consumers
MemberID string // Empty for new members
GroupInstanceID string // Optional static membership
ProtocolType string // "consumer" for regular consumers
GroupProtocols []GroupProtocol
}
@ -30,31 +30,31 @@ type GroupProtocol struct {
// JoinGroupResponse represents a JoinGroup response to a Kafka client
type JoinGroupResponse struct {
CorrelationID uint32
ErrorCode int16
GenerationID int32
GroupProtocol string
GroupLeader string
MemberID string
Members []JoinGroupMember // Only populated for group leader
CorrelationID uint32
ErrorCode int16
GenerationID int32
GroupProtocol string
GroupLeader string
MemberID string
Members []JoinGroupMember // Only populated for group leader
}
// JoinGroupMember represents member info sent to group leader
type JoinGroupMember struct {
MemberID string
MemberID string
GroupInstanceID string
Metadata []byte
Metadata []byte
}
// Error codes for JoinGroup
const (
ErrorCodeNone int16 = 0
ErrorCodeInvalidGroupID int16 = 24
ErrorCodeUnknownMemberID int16 = 25
ErrorCodeInvalidSessionTimeout int16 = 26
ErrorCodeRebalanceInProgress int16 = 27
ErrorCodeMemberIDRequired int16 = 79
ErrorCodeFencedInstanceID int16 = 82
ErrorCodeNone int16 = 0
ErrorCodeInvalidGroupID int16 = 24
ErrorCodeUnknownMemberID int16 = 25
ErrorCodeInvalidSessionTimeout int16 = 26
ErrorCodeRebalanceInProgress int16 = 27
ErrorCodeMemberIDRequired int16 = 79
ErrorCodeFencedInstanceID int16 = 82
)
func (h *Handler) handleJoinGroup(correlationID uint32, apiVersion uint16, requestBody []byte) ([]byte, error) {
@ -116,12 +116,14 @@ func (h *Handler) handleJoinGroup(correlationID uint32, apiVersion uint16, reque
}
// Check group state
fmt.Printf("DEBUG: JoinGroup current group state: %s, generation: %d\n", group.State, group.Generation)
switch group.State {
case consumer.GroupStateEmpty, consumer.GroupStateStable:
// Can join or trigger rebalance
if isNewMember || len(group.Members) == 0 {
group.State = consumer.GroupStatePreparingRebalance
group.Generation++
fmt.Printf("DEBUG: JoinGroup transitioned to PreparingRebalance, new generation: %d\n", group.Generation)
}
case consumer.GroupStatePreparingRebalance, consumer.GroupStateCompletingRebalance:
// Rebalance already in progress
@ -192,6 +194,9 @@ func (h *Handler) handleJoinGroup(correlationID uint32, apiVersion uint16, reque
// Select group leader (first member or keep existing if still present)
if group.Leader == "" || group.Members[group.Leader] == nil {
group.Leader = memberID
fmt.Printf("DEBUG: JoinGroup elected new leader: '%s' for group '%s'\n", memberID, request.GroupID)
} else {
fmt.Printf("DEBUG: JoinGroup keeping existing leader: '%s' for group '%s'\n", group.Leader, request.GroupID)
}
// Build response
@ -204,16 +209,23 @@ func (h *Handler) handleJoinGroup(correlationID uint32, apiVersion uint16, reque
MemberID: memberID,
}
fmt.Printf("DEBUG: JoinGroup response - Generation: %d, Protocol: '%s', Leader: '%s', Member: '%s'\n",
response.GenerationID, response.GroupProtocol, response.GroupLeader, response.MemberID)
// If this member is the leader, include all member info
if memberID == group.Leader {
fmt.Printf("DEBUG: JoinGroup member '%s' is the leader, including %d members in response\n", memberID, len(group.Members))
response.Members = make([]JoinGroupMember, 0, len(group.Members))
for _, m := range group.Members {
response.Members = append(response.Members, JoinGroupMember{
MemberID: m.ID,
GroupInstanceID: m.ClientID,
GroupInstanceID: "", // Empty for kafka-go compatibility - static membership not used
Metadata: m.Metadata,
})
fmt.Printf("DEBUG: JoinGroup adding member to response - ID: '%s', Metadata: %d bytes\n", m.ID, len(m.Metadata))
}
} else {
fmt.Printf("DEBUG: JoinGroup member '%s' is NOT the leader (leader is '%s'), empty members array\n", memberID, group.Leader)
}
return h.buildJoinGroupResponse(response), nil
@ -411,10 +423,10 @@ func (h *Handler) updateGroupSubscription(group *consumer.ConsumerGroup) {
// SyncGroupRequest represents a SyncGroup request from a Kafka client
type SyncGroupRequest struct {
GroupID string
GenerationID int32
MemberID string
GroupInstanceID string
GroupID string
GenerationID int32
MemberID string
GroupInstanceID string
GroupAssignments []GroupAssignment // Only from group leader
}
@ -433,7 +445,7 @@ type SyncGroupResponse struct {
// Additional error codes for SyncGroup
const (
ErrorCodeIllegalGeneration int16 = 22
ErrorCodeIllegalGeneration int16 = 22
ErrorCodeInconsistentGroupProtocol int16 = 23
)

Loading…
Cancel
Save