package integration

import (
    "context"
    "fmt"
    "io"
    "time"

    "github.com/seaweedfs/seaweedfs/weed/glog"
    "github.com/seaweedfs/seaweedfs/weed/pb/mq_pb"
    "github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
)
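
// Typical fetch flow through this file (illustrative sketch: the functions and
// signatures are the ones defined below, but the topic, partition, group, and
// offset values are made-up examples):
//
//	session, err := bc.GetOrCreateSubscriber("user-events", 0, 42, "group-a", "consumer-1")
//	if err != nil {
//		return err
//	}
//	records, err := bc.ReadRecordsFromOffset(ctx, session, 42, 100)
//	// ... hand records back to the Kafka fetch response ...
//	bc.CloseSubscriber("user-events", 0, "group-a", "consumer-1")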

// createSubscribeInitMessage creates a subscribe init message with the given parameters
func createSubscribeInitMessage(topic string, actualPartition *schema_pb.Partition, startOffset int64, offsetType schema_pb.OffsetType, consumerGroup string, consumerID string) *mq_pb.SubscribeMessageRequest {
    return &mq_pb.SubscribeMessageRequest{
        Message: &mq_pb.SubscribeMessageRequest_Init{
            Init: &mq_pb.SubscribeMessageRequest_InitMessage{
                ConsumerGroup: consumerGroup,
                ConsumerId:    consumerID,
                ClientId:      "kafka-gateway",
                Topic: &schema_pb.Topic{
                    Namespace: "kafka",
                    Name:      topic,
                },
                PartitionOffset: &schema_pb.PartitionOffset{
                    Partition:   actualPartition,
                    StartTsNs:   0,
                    StartOffset: startOffset,
                },
                OffsetType:        offsetType,
                SlidingWindowSize: 10,
            },
        },
    }
}

// CreateFreshSubscriber creates a new subscriber session without caching
// This ensures each fetch gets fresh data from the requested offset
// consumerGroup and consumerID are passed from the Kafka client for proper tracking in SMQ
func (bc *BrokerClient) CreateFreshSubscriber(topic string, partition int32, startOffset int64, consumerGroup string, consumerID string) (*BrokerSubscriberSession, error) {
    // Use BrokerClient's context so the subscriber is cancelled when the connection closes
    subscriberCtx, subscriberCancel := context.WithCancel(bc.ctx)

    stream, err := bc.client.SubscribeMessage(subscriberCtx)
    if err != nil {
        subscriberCancel() // release the context on failure
        return nil, fmt.Errorf("failed to create subscribe stream: %v", err)
    }

    // Get the actual partition assignment from the broker
    actualPartition, err := bc.getActualPartitionAssignment(topic, partition)
    if err != nil {
        subscriberCancel()
        return nil, fmt.Errorf("failed to get actual partition assignment for subscribe: %v", err)
    }

    // Use EXACT_OFFSET to read from the specific offset
    offsetType := schema_pb.OffsetType_EXACT_OFFSET

    // Send init message to start the subscription with the Kafka client's consumer group and ID
    initReq := createSubscribeInitMessage(topic, actualPartition, startOffset, offsetType, consumerGroup, consumerID)

    glog.V(2).Infof("[SUBSCRIBE-INIT] CreateFreshSubscriber sending init: topic=%s partition=%d startOffset=%d offsetType=%v consumerGroup=%s consumerID=%s",
        topic, partition, startOffset, offsetType, consumerGroup, consumerID)

    if err := stream.Send(initReq); err != nil {
        subscriberCancel()
        return nil, fmt.Errorf("failed to send subscribe init: %v", err)
    }

    // IMPORTANT: Don't wait for the init response here!
    // The broker may send the first data record as the "init response".
    // If we call Recv() here, we'll consume that first record and ReadRecords will block
    // waiting for the second record, causing a 30-second timeout.
    // Instead, let ReadRecords handle all Recv() calls.

    session := &BrokerSubscriberSession{
        Stream:        stream,
        Topic:         topic,
        Partition:     partition,
        StartOffset:   startOffset,
        ConsumerGroup: consumerGroup,
        ConsumerID:    consumerID,
        Ctx:           subscriberCtx,
        Cancel:        subscriberCancel,
    }

    return session, nil
}

// GetOrCreateSubscriber gets or creates a subscriber for offset tracking
func (bc *BrokerClient) GetOrCreateSubscriber(topic string, partition int32, startOffset int64, consumerGroup string, consumerID string) (*BrokerSubscriberSession, error) {
    // Create a temporary session to generate the key
    tempSession := &BrokerSubscriberSession{
        Topic:         topic,
        Partition:     partition,
        ConsumerGroup: consumerGroup,
        ConsumerID:    consumerID,
    }
    key := tempSession.Key()

    bc.subscribersLock.RLock()
    if session, exists := bc.subscribers[key]; exists {
        // Check if we can reuse the existing session
        session.mu.Lock()
        currentOffset := session.StartOffset

        // Check the cache to see which offsets are available
        canUseCache := false
        if len(session.consumedRecords) > 0 {
            cacheStartOffset := session.consumedRecords[0].Offset
            cacheEndOffset := session.consumedRecords[len(session.consumedRecords)-1].Offset
            if startOffset >= cacheStartOffset && startOffset <= cacheEndOffset {
                canUseCache = true
            }
        }
        session.mu.Unlock()

        // With a seekable broker: always reuse the existing session.
        // Any offset mismatch will be handled by FetchRecords via SeekMessage.
        // This includes:
        // 1. Forward read: natural continuation
        // 2. Backward read with cache hit: serve from cache
        // 3. Backward read without cache: send a seek message to the broker
        // No stream recreation is needed - the broker repositions internally.

        bc.subscribersLock.RUnlock()
        if canUseCache {
            glog.V(2).Infof("[FETCH] Reusing session for %s: session at %d, requested %d (cached)",
                key, currentOffset, startOffset)
        } else if startOffset >= currentOffset {
            glog.V(2).Infof("[FETCH] Reusing session for %s: session at %d, requested %d (forward read)",
                key, currentOffset, startOffset)
        } else {
            glog.V(2).Infof("[FETCH] Reusing session for %s: session at %d, requested %d (will seek backward)",
                key, currentOffset, startOffset)
        }
        return session, nil
    }

    // Session doesn't exist - need to create one
    bc.subscribersLock.RUnlock()

    // Create new subscriber stream
    // Need to acquire the write lock since we don't hold it from the paths above
    bc.subscribersLock.Lock()
    defer bc.subscribersLock.Unlock()

    // Double-check whether the session was created by another thread while we were acquiring the lock
    if session, exists := bc.subscribers[key]; exists {
        // With a seekable broker, always reuse the existing session
        // FetchRecords will handle any offset mismatch via seek
        session.mu.Lock()
        existingOffset := session.StartOffset
        session.mu.Unlock()

        glog.V(1).Infof("[FETCH] Session created concurrently at offset %d (requested %d), reusing", existingOffset, startOffset)
        return session, nil
    }

    // Use BrokerClient's context so subscribers are automatically cancelled when the connection closes
    // This ensures proper cleanup without artificial timeouts
    subscriberCtx, subscriberCancel := context.WithCancel(bc.ctx)

    stream, err := bc.client.SubscribeMessage(subscriberCtx)
    if err != nil {
        subscriberCancel() // release the context on failure
        return nil, fmt.Errorf("failed to create subscribe stream: %v", err)
    }

    // Get the actual partition assignment from the broker instead of using the Kafka partition mapping
    actualPartition, err := bc.getActualPartitionAssignment(topic, partition)
    if err != nil {
        subscriberCancel()
        return nil, fmt.Errorf("failed to get actual partition assignment for subscribe: %v", err)
    }

    // Convert the Kafka offset to the appropriate SeaweedMQ OffsetType
    var offsetType schema_pb.OffsetType
    var offsetValue int64

    if startOffset == -1 {
        // Kafka offset -1 typically means "latest"
        offsetType = schema_pb.OffsetType_RESET_TO_LATEST
        offsetValue = 0 // Not used with RESET_TO_LATEST
        glog.V(2).Infof("Using RESET_TO_LATEST for Kafka offset -1 (read latest)")
    } else {
        // CRITICAL FIX: Use EXACT_OFFSET to position the subscriber at the exact Kafka offset
        // This allows the subscriber to read from both buffer and disk at the correct position
        offsetType = schema_pb.OffsetType_EXACT_OFFSET
        offsetValue = startOffset // Use the exact Kafka offset
        glog.V(2).Infof("Using EXACT_OFFSET for Kafka offset %d (direct positioning)", startOffset)
    }

    glog.V(2).Infof("Creating subscriber for topic=%s partition=%d: Kafka offset %d -> SeaweedMQ %s",
        topic, partition, startOffset, offsetType)

    glog.V(2).Infof("[SUBSCRIBE-INIT] GetOrCreateSubscriber sending init: topic=%s partition=%d startOffset=%d offsetType=%v consumerGroup=%s consumerID=%s",
        topic, partition, offsetValue, offsetType, consumerGroup, consumerID)

    // Send init message using the actual partition structure that the broker allocated
    initReq := createSubscribeInitMessage(topic, actualPartition, offsetValue, offsetType, consumerGroup, consumerID)
    if err := stream.Send(initReq); err != nil {
        subscriberCancel()
        return nil, fmt.Errorf("failed to send subscribe init: %v", err)
    }

    session := &BrokerSubscriberSession{
        Topic:         topic,
        Partition:     partition,
        Stream:        stream,
        StartOffset:   startOffset,
        ConsumerGroup: consumerGroup,
        ConsumerID:    consumerID,
        Ctx:           subscriberCtx,
        Cancel:        subscriberCancel,
    }

    bc.subscribers[key] = session
    glog.V(2).Infof("Created subscriber session for %s with context cancellation support", key)
    return session, nil
}
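
// Offset mapping summary (illustrative values): GetOrCreateSubscriber maps
// Kafka startOffset -1 to OffsetType_RESET_TO_LATEST (offsetValue unused), and
// any other startOffset, e.g. 42, to OffsetType_EXACT_OFFSET with offsetValue 42.
// CreateFreshSubscriber always uses EXACT_OFFSET.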

// ReadRecordsFromOffset reads records starting from a specific offset
// If the offset is in cache, returns cached records; otherwise delegates to ReadRecords
// ctx controls the fetch timeout (should match Kafka fetch request's MaxWaitTime)
func (bc *BrokerClient) ReadRecordsFromOffset(ctx context.Context, session *BrokerSubscriberSession, requestedOffset int64, maxRecords int) ([]*SeaweedRecord, error) {
    if session == nil {
        return nil, fmt.Errorf("subscriber session cannot be nil")
    }

    session.mu.Lock()

    glog.V(2).Infof("[FETCH] ReadRecordsFromOffset: topic=%s partition=%d requestedOffset=%d sessionOffset=%d maxRecords=%d",
        session.Topic, session.Partition, requestedOffset, session.StartOffset, maxRecords)

    // Check cache first
    if len(session.consumedRecords) > 0 {
        cacheStartOffset := session.consumedRecords[0].Offset
        cacheEndOffset := session.consumedRecords[len(session.consumedRecords)-1].Offset

        if requestedOffset >= cacheStartOffset && requestedOffset <= cacheEndOffset {
            // Found in cache
            startIdx := int(requestedOffset - cacheStartOffset)
            // CRITICAL: Bounds check to prevent race condition where cache is modified between checks
            if startIdx < 0 || startIdx >= len(session.consumedRecords) {
                glog.V(2).Infof("[FETCH] Cache index out of bounds (race condition): startIdx=%d, cache size=%d, falling through to normal read",
                    startIdx, len(session.consumedRecords))
                // Cache was modified, fall through to normal read path
            } else {
                endIdx := startIdx + maxRecords
                if endIdx > len(session.consumedRecords) {
                    endIdx = len(session.consumedRecords)
                }
                glog.V(2).Infof("[FETCH] Returning %d cached records for %s at offset %d (cache: %d-%d)",
                    endIdx-startIdx, session.Key(), requestedOffset, cacheStartOffset, cacheEndOffset)
                // CRITICAL: Capture slice while holding lock to prevent race condition
                // If we unlock before slicing, another goroutine could clear consumedRecords
                result := session.consumedRecords[startIdx:endIdx]
                session.mu.Unlock()
                return result, nil
            }
        } else {
            glog.V(2).Infof("[FETCH] Cache miss for %s: requested=%d, cache=[%d-%d]",
                session.Key(), requestedOffset, cacheStartOffset, cacheEndOffset)
        }
    }

    // Get the current offset atomically for comparison
    currentStartOffset := session.StartOffset
    session.mu.Unlock()

    // With seekable broker: Keep subscriber alive across all requests
    // Schema Registry and other clients expect persistent consumer connections
    //
    // Three scenarios, all handled via seek:
    // 1. requestedOffset < session.StartOffset: Send seek message (backward)
    // 2. requestedOffset == session.StartOffset: Continue reading (no seek needed)
    // 3. requestedOffset > session.StartOffset: Send seek message (forward)
    //
    // The stream persists for the entire consumer session - no recreation needed
    if requestedOffset != currentStartOffset {
        offsetDiff := requestedOffset - currentStartOffset
        seekDirection := "forward"
        if offsetDiff < 0 {
            seekDirection = "backward"
        }

        glog.V(2).Infof("[FETCH] Offset mismatch: %s seek from %d to %d (diff=%d)",
            seekDirection, currentStartOffset, requestedOffset, offsetDiff)

        // Send seek message to reposition stream
        seekMsg := &mq_pb.SubscribeMessageRequest{
            Message: &mq_pb.SubscribeMessageRequest_Seek{
                Seek: &mq_pb.SubscribeMessageRequest_SeekMessage{
                    Offset:     requestedOffset,
                    OffsetType: schema_pb.OffsetType_EXACT_OFFSET,
                },
            },
        }

        if err := session.Stream.Send(seekMsg); err != nil {
            // Handle graceful shutdown: EOF means stream is closing
            if err == io.EOF {
                glog.V(2).Infof("[FETCH] Stream closing during seek to offset %d, returning empty", requestedOffset)
                return []*SeaweedRecord{}, nil
            }
            return nil, fmt.Errorf("seek to offset %d failed: %v", requestedOffset, err)
        }

        // Update session state after successful seek
        session.mu.Lock()
        session.StartOffset = requestedOffset

        // CRITICAL: Only clear cache if seeking forward past cached data
        // For backward seeks, keep cache to avoid re-reading same data from broker
        shouldClearCache := true
        if len(session.consumedRecords) > 0 {
            cacheStartOffset := session.consumedRecords[0].Offset
            cacheEndOffset := session.consumedRecords[len(session.consumedRecords)-1].Offset
            // Keep cache if seeking to an offset within or before cached range
            if requestedOffset <= cacheEndOffset {
                shouldClearCache = false
                glog.V(2).Infof("[FETCH] Keeping cache after seek to %d (cache: [%d-%d])",
                    requestedOffset, cacheStartOffset, cacheEndOffset)
            }
        }
        if shouldClearCache {
            session.consumedRecords = nil
            glog.V(2).Infof("[FETCH] Cleared cache after forward seek to %d", requestedOffset)
        }
        session.mu.Unlock()

        glog.V(2).Infof("[FETCH] Seek to offset %d successful", requestedOffset)
    } else {
        glog.V(2).Infof("[FETCH] Offset match: continuing from offset %d", requestedOffset)
    }

    // Read records from current position
    return bc.ReadRecords(ctx, session, maxRecords)
}
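
// ReadRecordsFromOffset behavior at a glance (illustrative: assume the session
// is positioned at offset 10 and the cache holds offsets [5-9]):
//   - requestedOffset == 10: offsets match, read from the stream without seeking
//   - requestedOffset == 7:  served directly from the cache, no broker round trip
//   - requestedOffset == 3:  backward seek sent to the broker; cache kept because 3 <= 9
//   - requestedOffset == 25: forward seek sent to the broker; cache cleared because 25 > 9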

// ReadRecords reads available records from the subscriber stream
// Uses a timeout-based approach to read multiple records without blocking indefinitely
// ctx controls the fetch timeout (should match Kafka fetch request's MaxWaitTime)
func (bc *BrokerClient) ReadRecords(ctx context.Context, session *BrokerSubscriberSession, maxRecords int) ([]*SeaweedRecord, error) {
    if session == nil {
        return nil, fmt.Errorf("subscriber session cannot be nil")
    }

    if session.Stream == nil {
        return nil, fmt.Errorf("subscriber session stream cannot be nil")
    }

    // CRITICAL: Lock to prevent concurrent reads from the same stream
    // Multiple Fetch requests may try to read from the same subscriber concurrently,
    // causing the broker to return the same offset repeatedly
    session.mu.Lock()
    defer session.mu.Unlock()

    glog.V(2).Infof("[FETCH] ReadRecords: topic=%s partition=%d startOffset=%d maxRecords=%d",
        session.Topic, session.Partition, session.StartOffset, maxRecords)

    var records []*SeaweedRecord
    currentOffset := session.StartOffset

    // CRITICAL FIX: Return immediately if maxRecords is 0 or negative
    if maxRecords <= 0 {
        return records, nil
    }

    // CRITICAL FIX: Use cached records if available to avoid broker tight loop
    // If we've already consumed these records, return them from cache
    if len(session.consumedRecords) > 0 {
        cacheStartOffset := session.consumedRecords[0].Offset
        cacheEndOffset := session.consumedRecords[len(session.consumedRecords)-1].Offset

        if currentOffset >= cacheStartOffset && currentOffset <= cacheEndOffset {
            // Records are in cache
            glog.V(2).Infof("[FETCH] Returning cached records: requested offset %d is in cache [%d-%d]",
                currentOffset, cacheStartOffset, cacheEndOffset)

            // Find starting index in cache
            startIdx := int(currentOffset - cacheStartOffset)
            if startIdx < 0 || startIdx >= len(session.consumedRecords) {
                glog.Errorf("[FETCH] Cache index out of bounds: startIdx=%d, cache size=%d", startIdx, len(session.consumedRecords))
                return records, nil
            }

            // Return up to maxRecords from cache
            endIdx := startIdx + maxRecords
            if endIdx > len(session.consumedRecords) {
                endIdx = len(session.consumedRecords)
            }

            glog.V(2).Infof("[FETCH] Returning %d cached records from index %d to %d", endIdx-startIdx, startIdx, endIdx-1)
            // CRITICAL: Capture slice result while holding lock (defer will unlock after return)
            result := session.consumedRecords[startIdx:endIdx]
            return result, nil
        }
    }

    // Read first record with timeout (important for empty topics)
    // CRITICAL: For SMQ backend with consumer groups, we need adequate timeout for disk reads
    // When a consumer group resumes from a committed offset, the subscriber may need to:
    // 1. Connect to the broker (network latency)
    // 2. Seek to the correct offset in the log file (disk I/O)
    // 3. Read and deserialize the record (disk I/O)
    // Total latency can be 100-500ms for cold reads from disk
    //
    // CRITICAL: Use the context from the Kafka fetch request
    // The context timeout is set by the caller based on the Kafka fetch request's MaxWaitTime
    // This ensures we wait exactly as long as the client requested, not more or less
    // For in-memory reads (hot path), records arrive in <10ms
    // For low-volume topics (like _schemas), the caller sets a longer timeout to keep the subscriber alive
    // If no context is provided, use a reasonable default timeout
    if ctx == nil {
        var cancel context.CancelFunc
        ctx, cancel = context.WithTimeout(context.Background(), 10*time.Second)
        defer cancel()
    }

    // CRITICAL: Capture stream pointer while holding lock to prevent TOCTOU race
    // If we access session.Stream in the goroutine, it could become nil between check and use
    stream := session.Stream
    if stream == nil {
        glog.V(2).Infof("[FETCH] Stream is nil, cannot read")
        return records, nil
    }

    type recvResult struct {
        resp *mq_pb.SubscribeMessageResponse
        err  error
    }
    recvChan := make(chan recvResult, 1)

    // Try to receive first record using captured stream pointer
    go func() {
        // Recover from panics caused by stream being closed during Recv()
        defer func() {
            if r := recover(); r != nil {
                select {
                case recvChan <- recvResult{resp: nil, err: fmt.Errorf("stream recv panicked: %v", r)}:
                case <-ctx.Done():
                }
            }
        }()
        resp, err := stream.Recv()
        select {
        case recvChan <- recvResult{resp: resp, err: err}:
        case <-ctx.Done():
            // Context cancelled, don't send (avoid blocking)
        }
    }()

    select {
    case result := <-recvChan:
        if result.err != nil {
            glog.V(2).Infof("[FETCH] Stream.Recv() error on first record: %v", result.err)
            return records, nil // Return empty - no error for empty topic
        }

        if dataMsg := result.resp.GetData(); dataMsg != nil {
            record := &SeaweedRecord{
                Key:       dataMsg.Key,
                Value:     dataMsg.Value,
                Timestamp: dataMsg.TsNs,
                Offset:    currentOffset,
            }
            records = append(records, record)
            currentOffset++
            glog.V(2).Infof("[FETCH] Received record: offset=%d, keyLen=%d, valueLen=%d",
                record.Offset, len(record.Key), len(record.Value))

            // CRITICAL: Auto-acknowledge first message immediately for Kafka gateway
            // Kafka uses offset commits (not per-message acks) so we must ack to prevent
            // broker from blocking on in-flight messages waiting for acks that will never come
            ackMsg := &mq_pb.SubscribeMessageRequest{
                Message: &mq_pb.SubscribeMessageRequest_Ack{
                    Ack: &mq_pb.SubscribeMessageRequest_AckMessage{
                        Key:  dataMsg.Key,
                        TsNs: dataMsg.TsNs,
                    },
                },
            }
            if err := stream.Send(ackMsg); err != nil {
                glog.V(2).Infof("[FETCH] Failed to send ack for first record offset %d: %v (continuing)", record.Offset, err)
                // Don't fail the fetch if ack fails - continue reading
            }
        }

    case <-ctx.Done():
        // Timeout on first record - topic is empty or no data available
        glog.V(2).Infof("[FETCH] No data available (timeout on first record)")
        return records, nil
    }

    // If we got the first record, try to get more with adaptive timeout
    // CRITICAL: Schema Registry catch-up scenario - give generous timeout for the first batch
    // Schema Registry needs to read multiple records quickly when catching up (e.g., offsets 3-6)
    // The broker may be reading from disk, which introduces 10-20ms delay between records
    //
    // Strategy: Start with generous timeout (1 second) for first 5 records to allow broker
    // to read from disk, then switch to fast mode (100ms) for streaming in-memory data
    consecutiveReads := 0

    for len(records) < maxRecords {
        // Adaptive timeout based on how many records we've already read
        var currentTimeout time.Duration
        if consecutiveReads < 5 {
            // First 5 records: generous timeout for disk reads + network delays
            currentTimeout = 1 * time.Second
        } else {
            // After 5 records: assume we're streaming from memory, use faster timeout
            currentTimeout = 100 * time.Millisecond
        }

        readStart := time.Now()
        // CRITICAL: Use parent context (ctx) to respect client's MaxWaitTime deadline
        // The per-record timeout is combined with the overall fetch deadline
        ctx2, cancel2 := context.WithTimeout(ctx, currentTimeout)
        recvChan2 := make(chan recvResult, 1)

        go func() {
            // Recover from panics caused by stream being closed during Recv()
            defer func() {
                if r := recover(); r != nil {
                    select {
                    case recvChan2 <- recvResult{resp: nil, err: fmt.Errorf("stream recv panicked: %v", r)}:
                    case <-ctx2.Done():
                    }
                }
            }()
            // Use captured stream pointer to prevent TOCTOU race
            resp, err := stream.Recv()
            select {
            case recvChan2 <- recvResult{resp: resp, err: err}:
            case <-ctx2.Done():
                // Context cancelled
            }
        }()

        select {
        case result := <-recvChan2:
            cancel2()
            readDuration := time.Since(readStart)

            if result.err != nil {
                glog.V(2).Infof("[FETCH] Stream.Recv() error after %d records: %v", len(records), result.err)
                // Update session offset before returning
                glog.V(2).Infof("[FETCH] Updating %s offset: %d -> %d (error case, read %d records)",
                    session.Key(), session.StartOffset, currentOffset, len(records))
                session.StartOffset = currentOffset
                return records, nil
            }

            if dataMsg := result.resp.GetData(); dataMsg != nil {
                record := &SeaweedRecord{
                    Key:       dataMsg.Key,
                    Value:     dataMsg.Value,
                    Timestamp: dataMsg.TsNs,
                    Offset:    currentOffset,
                }
                records = append(records, record)
                currentOffset++
                consecutiveReads++ // Track number of successful reads for adaptive timeout

                glog.V(2).Infof("[FETCH] Received record %d: offset=%d, keyLen=%d, valueLen=%d, readTime=%v",
                    len(records), record.Offset, len(record.Key), len(record.Value), readDuration)

                // CRITICAL: Auto-acknowledge message immediately for Kafka gateway
                // Kafka uses offset commits (not per-message acks) so we must ack to prevent
                // broker from blocking on in-flight messages waiting for acks that will never come
                ackMsg := &mq_pb.SubscribeMessageRequest{
                    Message: &mq_pb.SubscribeMessageRequest_Ack{
                        Ack: &mq_pb.SubscribeMessageRequest_AckMessage{
                            Key:  dataMsg.Key,
                            TsNs: dataMsg.TsNs,
                        },
                    },
                }
                if err := stream.Send(ackMsg); err != nil {
                    glog.V(2).Infof("[FETCH] Failed to send ack for offset %d: %v (continuing)", record.Offset, err)
                    // Don't fail the fetch if ack fails - continue reading
                }
            }

        case <-ctx2.Done():
            cancel2()
            // Timeout - return what we have
            glog.V(2).Infof("[FETCH] Read timeout after %d records (waited %v), returning batch", len(records), time.Since(readStart))
            // CRITICAL: Update session offset so the next fetch knows where we left off
            glog.V(2).Infof("[FETCH] Updating %s offset: %d -> %d (timeout case, read %d records)",
                session.Key(), session.StartOffset, currentOffset, len(records))
            session.StartOffset = currentOffset
            return records, nil
        }
    }

    glog.V(2).Infof("[FETCH] ReadRecords returning %d records (maxRecords reached)", len(records))
    // Update session offset after successful read
    glog.V(2).Infof("[FETCH] Updating %s offset: %d -> %d (success case, read %d records)",
        session.Key(), session.StartOffset, currentOffset, len(records))
    session.StartOffset = currentOffset

    // CRITICAL: Cache the consumed records to avoid broker tight loop
    // Append new records to cache (keep last 1000 records max for better hit rate)
    session.consumedRecords = append(session.consumedRecords, records...)
    if len(session.consumedRecords) > 1000 {
        // Keep only the most recent 1000 records
        session.consumedRecords = session.consumedRecords[len(session.consumedRecords)-1000:]
    }
    glog.V(2).Infof("[FETCH] Updated cache: now contains %d records", len(session.consumedRecords))

    return records, nil
}
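
// Timing sketch for ReadRecords (illustrative): the first record waits up to the
// caller's ctx deadline (the Kafka fetch MaxWaitTime); the next five reads each
// get a 1s per-record timeout to absorb disk reads; reads after that use a 100ms
// timeout for in-memory streaming. Every wait is still bounded by ctx.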

// CloseSubscriber closes and removes a subscriber session
func (bc *BrokerClient) CloseSubscriber(topic string, partition int32, consumerGroup string, consumerID string) {
    tempSession := &BrokerSubscriberSession{
        Topic:         topic,
        Partition:     partition,
        ConsumerGroup: consumerGroup,
        ConsumerID:    consumerID,
    }
    key := tempSession.Key()

    bc.subscribersLock.Lock()
    defer bc.subscribersLock.Unlock()

    if session, exists := bc.subscribers[key]; exists {
        // CRITICAL: Hold session lock while cancelling to prevent race with active Recv() calls
        session.mu.Lock()
        if session.Stream != nil {
            _ = session.Stream.CloseSend()
        }
        if session.Cancel != nil {
            session.Cancel()
        }
        session.mu.Unlock()
        delete(bc.subscribers, key)
        glog.V(2).Infof("[FETCH] Closed subscriber for %s", key)
    }
}

// NeedsRestart checks if the subscriber needs to restart to read from the given offset
// Returns true if:
// 1. Requested offset is before current position AND not in cache
// 2. Requested offset is more than 1000 ahead of the current position (large gap)
// 3. Stream is closed/invalid
func (bc *BrokerClient) NeedsRestart(session *BrokerSubscriberSession, requestedOffset int64) bool {
    session.mu.Lock()
    defer session.mu.Unlock()

    // Check if stream is still valid
    if session.Stream == nil || session.Ctx == nil {
        return true
    }

    // Check if we can serve from cache
    if len(session.consumedRecords) > 0 {
        cacheStart := session.consumedRecords[0].Offset
        cacheEnd := session.consumedRecords[len(session.consumedRecords)-1].Offset
        if requestedOffset >= cacheStart && requestedOffset <= cacheEnd {
            // Can serve from cache, no restart needed
            return false
        }
    }

    // If requested offset is far behind current position, need restart
    if requestedOffset < session.StartOffset {
        return true
    }

    // Check if we're too far ahead (gap in cache)
    if requestedOffset > session.StartOffset+1000 {
        // Large gap - might be more efficient to restart
        return true
    }

    return false
}
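
// NeedsRestart decision examples (illustrative: assume session.StartOffset == 100
// and a cache covering offsets [90-99]):
//   - requestedOffset == 95:   false (can be served from cache)
//   - requestedOffset == 100:  false (natural forward continuation)
//   - requestedOffset == 50:   true  (behind current position and not cached)
//   - requestedOffset == 1500: true  (gap larger than 1000 offsets)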

// RestartSubscriber restarts an existing subscriber from a new offset
// This is more efficient than closing and recreating the session
func (bc *BrokerClient) RestartSubscriber(session *BrokerSubscriberSession, newOffset int64, consumerGroup string, consumerID string) error {
    session.mu.Lock()
    defer session.mu.Unlock()

    glog.V(2).Infof("[FETCH] Restarting subscriber for %s[%d]: from offset %d to %d",
        session.Topic, session.Partition, session.StartOffset, newOffset)

    // Close existing stream
    if session.Stream != nil {
        _ = session.Stream.CloseSend()
    }
    if session.Cancel != nil {
        session.Cancel()
    }

    // Clear cache since we're seeking to a different position
    session.consumedRecords = nil
    session.nextOffsetToRead = newOffset

    // Create new stream from new offset
    subscriberCtx, cancel := context.WithCancel(bc.ctx)

    stream, err := bc.client.SubscribeMessage(subscriberCtx)
    if err != nil {
        cancel()
        return fmt.Errorf("failed to create subscribe stream for restart: %v", err)
    }

    // Get the actual partition assignment
    actualPartition, err := bc.getActualPartitionAssignment(session.Topic, session.Partition)
    if err != nil {
        cancel()
        _ = stream.CloseSend()
        return fmt.Errorf("failed to get actual partition assignment for restart: %v", err)
    }

    // Send init message with new offset
    initReq := createSubscribeInitMessage(session.Topic, actualPartition, newOffset, schema_pb.OffsetType_EXACT_OFFSET, consumerGroup, consumerID)

    if err := stream.Send(initReq); err != nil {
        cancel()
        _ = stream.CloseSend()
        return fmt.Errorf("failed to send subscribe init for restart: %v", err)
    }

    // Update session with new stream and offset
    session.Stream = stream
    session.Cancel = cancel
    session.Ctx = subscriberCtx
    session.StartOffset = newOffset

    glog.V(2).Infof("[FETCH] Successfully restarted subscriber for %s[%d] at offset %d",
        session.Topic, session.Partition, newOffset)

    return nil
}

// Seek helper methods for BrokerSubscriberSession

// SeekToOffset repositions the stream to read from a specific offset
func (session *BrokerSubscriberSession) SeekToOffset(offset int64) error {
    // Skip seek if already at the requested offset
    session.mu.Lock()
    currentOffset := session.StartOffset
    session.mu.Unlock()

    if currentOffset == offset {
        glog.V(2).Infof("[SEEK] Already at offset %d for %s[%d], skipping seek", offset, session.Topic, session.Partition)
        return nil
    }

    seekMsg := &mq_pb.SubscribeMessageRequest{
        Message: &mq_pb.SubscribeMessageRequest_Seek{
            Seek: &mq_pb.SubscribeMessageRequest_SeekMessage{
                Offset:     offset,
                OffsetType: schema_pb.OffsetType_EXACT_OFFSET,
            },
        },
    }

    if err := session.Stream.Send(seekMsg); err != nil {
        // Handle graceful shutdown
        if err == io.EOF {
            glog.V(2).Infof("[SEEK] Stream closing during seek to offset %d for %s[%d]", offset, session.Topic, session.Partition)
            return nil // Not an error during shutdown
        }
        return fmt.Errorf("seek to offset %d failed: %v", offset, err)
    }

    session.mu.Lock()
    session.StartOffset = offset
    // Only clear cache if seeking forward past cached data
    shouldClearCache := true
    if len(session.consumedRecords) > 0 {
        cacheEndOffset := session.consumedRecords[len(session.consumedRecords)-1].Offset
        if offset <= cacheEndOffset {
            shouldClearCache = false
        }
    }
    if shouldClearCache {
        session.consumedRecords = nil
    }
    session.mu.Unlock()

    glog.V(2).Infof("[SEEK] Seeked to offset %d for %s[%d]", offset, session.Topic, session.Partition)
    return nil
}

// SeekToTimestamp repositions the stream to read from messages at or after a specific timestamp
// timestamp is in nanoseconds since Unix epoch
// Note: We don't skip this operation even if we think we're at the right position because
// we can't easily determine the offset corresponding to a timestamp without querying the broker
func (session *BrokerSubscriberSession) SeekToTimestamp(timestampNs int64) error {
    seekMsg := &mq_pb.SubscribeMessageRequest{
        Message: &mq_pb.SubscribeMessageRequest_Seek{
            Seek: &mq_pb.SubscribeMessageRequest_SeekMessage{
                Offset:     timestampNs,
                OffsetType: schema_pb.OffsetType_EXACT_TS_NS,
            },
        },
    }

    if err := session.Stream.Send(seekMsg); err != nil {
        // Handle graceful shutdown
        if err == io.EOF {
            glog.V(2).Infof("[SEEK] Stream closing during seek to timestamp %d for %s[%d]", timestampNs, session.Topic, session.Partition)
            return nil // Not an error during shutdown
        }
        return fmt.Errorf("seek to timestamp %d failed: %v", timestampNs, err)
    }

    session.mu.Lock()
    // Note: We don't know the exact offset at this timestamp yet
    // It will be updated when we read the first message
    session.consumedRecords = nil
    session.mu.Unlock()

    glog.V(2).Infof("[SEEK] Seeked to timestamp %d for %s[%d]", timestampNs, session.Topic, session.Partition)
    return nil
}

// SeekToEarliest repositions the stream to the beginning of the partition
// Note: We don't skip this operation even if StartOffset == 0 because the broker
// may have a different notion of "earliest" (e.g., after compaction or retention)
func (session *BrokerSubscriberSession) SeekToEarliest() error {
    seekMsg := &mq_pb.SubscribeMessageRequest{
        Message: &mq_pb.SubscribeMessageRequest_Seek{
            Seek: &mq_pb.SubscribeMessageRequest_SeekMessage{
                Offset:     0,
                OffsetType: schema_pb.OffsetType_RESET_TO_EARLIEST,
            },
        },
    }

    if err := session.Stream.Send(seekMsg); err != nil {
        // Handle graceful shutdown
        if err == io.EOF {
            glog.V(2).Infof("[SEEK] Stream closing during seek to earliest for %s[%d]", session.Topic, session.Partition)
            return nil // Not an error during shutdown
        }
        return fmt.Errorf("seek to earliest failed: %v", err)
    }

    session.mu.Lock()
    session.StartOffset = 0
    session.consumedRecords = nil
    session.mu.Unlock()

    glog.V(2).Infof("[SEEK] Seeked to earliest for %s[%d]", session.Topic, session.Partition)
    return nil
}

// SeekToLatest repositions the stream to the end of the partition (next new message)
// Note: We don't skip this operation because "latest" is a moving target and we can't
// reliably determine if we're already at the latest position without querying the broker
func (session *BrokerSubscriberSession) SeekToLatest() error {
    seekMsg := &mq_pb.SubscribeMessageRequest{
        Message: &mq_pb.SubscribeMessageRequest_Seek{
            Seek: &mq_pb.SubscribeMessageRequest_SeekMessage{
                Offset:     0,
                OffsetType: schema_pb.OffsetType_RESET_TO_LATEST,
            },
        },
    }

    if err := session.Stream.Send(seekMsg); err != nil {
        // Handle graceful shutdown
        if err == io.EOF {
            glog.V(2).Infof("[SEEK] Stream closing during seek to latest for %s[%d]", session.Topic, session.Partition)
            return nil // Not an error during shutdown
        }
        return fmt.Errorf("seek to latest failed: %v", err)
    }

    session.mu.Lock()
    // Offset will be set when we read the first new message
    session.consumedRecords = nil
    session.mu.Unlock()

    glog.V(2).Infof("[SEEK] Seeked to latest for %s[%d]", session.Topic, session.Partition)
    return nil
}