package protocol import ( "context" "encoding/binary" "fmt" "hash/crc32" "strings" "time" "github.com/seaweedfs/seaweedfs/weed/glog" "github.com/seaweedfs/seaweedfs/weed/mq/kafka/compression" "github.com/seaweedfs/seaweedfs/weed/mq/kafka/integration" "github.com/seaweedfs/seaweedfs/weed/mq/kafka/schema" "github.com/seaweedfs/seaweedfs/weed/pb/schema_pb" "google.golang.org/protobuf/proto" ) // partitionFetchResult holds the result of fetching from a single partition type partitionFetchResult struct { topicIndex int partitionIndex int recordBatch []byte highWaterMark int64 errorCode int16 fetchDuration time.Duration } func (h *Handler) handleFetch(ctx context.Context, correlationID uint32, apiVersion uint16, requestBody []byte) ([]byte, error) { // Parse the Fetch request to get the requested topics and partitions fetchRequest, err := h.parseFetchRequest(apiVersion, requestBody) if err != nil { return nil, fmt.Errorf("parse fetch request: %w", err) } // Basic long-polling to avoid client busy-looping when there's no data. var throttleTimeMs int32 = 0 // Only long-poll when all referenced topics exist; unknown topics should not block allTopicsExist := func() bool { for _, topic := range fetchRequest.Topics { if !h.seaweedMQHandler.TopicExists(topic.Name) { return false } } return true } hasDataAvailable := func() bool { // Check if any requested partition has data available // Compare fetch offset with high water mark for _, topic := range fetchRequest.Topics { if !h.seaweedMQHandler.TopicExists(topic.Name) { continue } for _, partition := range topic.Partitions { hwm, err := h.seaweedMQHandler.GetLatestOffset(topic.Name, partition.PartitionID) if err != nil { continue } // Normalize fetch offset effectiveOffset := partition.FetchOffset if effectiveOffset == -2 { // earliest effectiveOffset = 0 } else if effectiveOffset == -1 { // latest effectiveOffset = hwm } // If fetch offset < hwm, data is available if effectiveOffset < hwm { return true } } } return false } // Long-poll when client requests it via MaxWaitTime and there's no data // Even if MinBytes=0, we should honor MaxWaitTime to reduce polling overhead maxWaitMs := fetchRequest.MaxWaitTime // Long-poll if: (1) client wants to wait (maxWaitMs > 0), (2) no data available, (3) topics exist // NOTE: We long-poll even if MinBytes=0, since the client specified a wait time hasData := hasDataAvailable() topicsExist := allTopicsExist() shouldLongPoll := maxWaitMs > 0 && !hasData && topicsExist if shouldLongPoll { start := time.Now() // Use the client's requested wait time (already capped at 1s) maxPollTime := time.Duration(maxWaitMs) * time.Millisecond deadline := start.Add(maxPollTime) pollLoop: for time.Now().Before(deadline) { // Use context-aware sleep instead of blocking time.Sleep select { case <-ctx.Done(): throttleTimeMs = int32(time.Since(start) / time.Millisecond) break pollLoop case <-time.After(10 * time.Millisecond): // Continue with polling } if hasDataAvailable() { break pollLoop } } elapsed := time.Since(start) throttleTimeMs = int32(elapsed / time.Millisecond) } // Build the response response := make([]byte, 0, 1024) totalAppendedRecordBytes := 0 // NOTE: Correlation ID is NOT included in the response body // The wire protocol layer (writeResponseWithTimeout) writes: [Size][CorrelationID][Body] // Kafka clients read the correlation ID separately from the 8-byte header, then read Size-4 bytes of body // If we include correlation ID here, clients will see it twice and fail with "4 extra bytes" errors // Fetch v1+ has 
throttle_time_ms at the beginning if apiVersion >= 1 { throttleBytes := make([]byte, 4) binary.BigEndian.PutUint32(throttleBytes, uint32(throttleTimeMs)) response = append(response, throttleBytes...) } // Fetch v7+ has error_code and session_id if apiVersion >= 7 { response = append(response, 0, 0) // error_code (2 bytes, 0 = no error) response = append(response, 0, 0, 0, 0) // session_id (4 bytes, 0 = no session) } // Check if this version uses flexible format (v12+) isFlexible := IsFlexibleVersion(1, apiVersion) // API key 1 = Fetch // Topics count - write the actual number of topics in the request // Kafka protocol: we MUST return all requested topics in the response (even with empty data) topicsCount := len(fetchRequest.Topics) if isFlexible { // Flexible versions use compact array format (count + 1) response = append(response, EncodeUvarint(uint32(topicsCount+1))...) } else { topicsCountBytes := make([]byte, 4) binary.BigEndian.PutUint32(topicsCountBytes, uint32(topicsCount)) response = append(response, topicsCountBytes...) } // ==================================================================== // PERSISTENT PARTITION READERS // Use per-connection persistent goroutines that maintain offset position // and stream forward, eliminating repeated lookups and reducing broker CPU // ==================================================================== // Get connection context to access persistent partition readers connContext := h.getConnectionContextFromRequest(ctx) if connContext == nil { glog.Errorf("FETCH CORR=%d: Connection context not available - cannot use persistent readers", correlationID) return nil, fmt.Errorf("connection context not available") } glog.V(2).Infof("[%s] FETCH CORR=%d: Processing %d topics with %d total partitions", connContext.ConnectionID, correlationID, len(fetchRequest.Topics), func() int { count := 0 for _, t := range fetchRequest.Topics { count += len(t.Partitions) } return count }()) // Collect results from persistent readers // CRITICAL: Dispatch all requests concurrently, then wait for all results in parallel // to avoid sequential timeout accumulation type pendingFetch struct { topicName string partitionID int32 resultChan chan *partitionFetchResult } pending := make([]pendingFetch, 0) persistentFetchStart := time.Now() // Phase 1: Dispatch all fetch requests to partition readers (non-blocking) for _, topic := range fetchRequest.Topics { isSchematizedTopic := false if h.IsSchemaEnabled() { isSchematizedTopic = h.isSchematizedTopic(topic.Name) } for _, partition := range topic.Partitions { key := TopicPartitionKey{Topic: topic.Name, Partition: partition.PartitionID} // All topics (including system topics) use persistent readers for in-memory access // This enables instant notification and avoids ForceFlush dependencies // Get or create persistent reader for this partition reader := h.getOrCreatePartitionReader(ctx, connContext, key, partition.FetchOffset) if reader == nil { // Failed to create reader - add empty pending glog.Errorf("[%s] Failed to get/create partition reader for %s[%d]", connContext.ConnectionID, topic.Name, partition.PartitionID) nilChan := make(chan *partitionFetchResult, 1) nilChan <- &partitionFetchResult{errorCode: 3} // UNKNOWN_TOPIC_OR_PARTITION pending = append(pending, pendingFetch{ topicName: topic.Name, partitionID: partition.PartitionID, resultChan: nilChan, }) continue } // Signal reader to fetch (don't wait for result yet) resultChan := make(chan *partitionFetchResult, 1) fetchReq := &partitionFetchRequest{ requestedOffset: 
partition.FetchOffset, maxBytes: partition.MaxBytes, maxWaitMs: maxWaitMs, // Pass MaxWaitTime from Kafka fetch request resultChan: resultChan, isSchematized: isSchematizedTopic, apiVersion: apiVersion, } // Try to send request (increased timeout for CI environments with slow disk I/O) select { case reader.fetchChan <- fetchReq: // Request sent successfully, add to pending pending = append(pending, pendingFetch{ topicName: topic.Name, partitionID: partition.PartitionID, resultChan: resultChan, }) case <-time.After(200 * time.Millisecond): // Channel full, return empty result glog.Warningf("[%s] Reader channel full for %s[%d], returning empty", connContext.ConnectionID, topic.Name, partition.PartitionID) emptyChan := make(chan *partitionFetchResult, 1) emptyChan <- &partitionFetchResult{} pending = append(pending, pendingFetch{ topicName: topic.Name, partitionID: partition.PartitionID, resultChan: emptyChan, }) } } } // Phase 2: Wait for all results with adequate timeout for CI environments // CRITICAL: We MUST return a result for every requested partition or Sarama will error results := make([]*partitionFetchResult, len(pending)) deadline := time.After(500 * time.Millisecond) // 500ms for all partitions (increased for CI disk I/O) // Collect results one by one with shared deadline for i, pf := range pending { select { case result := <-pf.resultChan: results[i] = result case <-deadline: // Deadline expired, return empty for this and all remaining partitions for j := i; j < len(pending); j++ { results[j] = &partitionFetchResult{} } glog.V(1).Infof("[%s] Fetch deadline expired, returning empty for %d remaining partitions", connContext.ConnectionID, len(pending)-i) goto done case <-ctx.Done(): // Context cancelled, return empty for remaining for j := i; j < len(pending); j++ { results[j] = &partitionFetchResult{} } goto done } } done: _ = time.Since(persistentFetchStart) // persistentFetchDuration // ==================================================================== // BUILD RESPONSE FROM FETCHED DATA // Now assemble the response in the correct order using fetched results // ==================================================================== // CRITICAL: Verify we have results for all requested partitions // Sarama requires a response block for EVERY requested partition to avoid ErrIncompleteResponse expectedResultCount := 0 for _, topic := range fetchRequest.Topics { expectedResultCount += len(topic.Partitions) } if len(results) != expectedResultCount { glog.Errorf("[%s] Result count mismatch: expected %d, got %d - this will cause ErrIncompleteResponse", connContext.ConnectionID, expectedResultCount, len(results)) // Pad with empty results if needed (safety net - shouldn't happen with fixed code) for len(results) < expectedResultCount { results = append(results, &partitionFetchResult{}) } } // Process each requested topic resultIdx := 0 for _, topic := range fetchRequest.Topics { topicNameBytes := []byte(topic.Name) // Topic name length and name if isFlexible { // Flexible versions use compact string format (length + 1) response = append(response, EncodeUvarint(uint32(len(topicNameBytes)+1))...) } else { response = append(response, byte(len(topicNameBytes)>>8), byte(len(topicNameBytes))) } response = append(response, topicNameBytes...) // Partitions count for this topic partitionsCount := len(topic.Partitions) if isFlexible { // Flexible versions use compact array format (count + 1) response = append(response, EncodeUvarint(uint32(partitionsCount+1))...) 
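			// KIP-482 compact arrays/strings encode length+1 as an unsigned varint so that 0 can
			// mean "null" and 1 means "empty"; e.g. a topic with 3 partitions is written here as
			// the single byte 0x04.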
} else { partitionsCountBytes := make([]byte, 4) binary.BigEndian.PutUint32(partitionsCountBytes, uint32(partitionsCount)) response = append(response, partitionsCountBytes...) } // Process each requested partition (using pre-fetched results) for _, partition := range topic.Partitions { // Get the pre-fetched result for this partition result := results[resultIdx] resultIdx++ // Partition ID partitionIDBytes := make([]byte, 4) binary.BigEndian.PutUint32(partitionIDBytes, uint32(partition.PartitionID)) response = append(response, partitionIDBytes...) // Error code (2 bytes) - use the result's error code response = append(response, byte(result.errorCode>>8), byte(result.errorCode)) // Use the pre-fetched high water mark from concurrent fetch highWaterMark := result.highWaterMark // High water mark (8 bytes) highWaterMarkBytes := make([]byte, 8) binary.BigEndian.PutUint64(highWaterMarkBytes, uint64(highWaterMark)) response = append(response, highWaterMarkBytes...) // Fetch v4+ has last_stable_offset and log_start_offset if apiVersion >= 4 { // Last stable offset (8 bytes) - same as high water mark for non-transactional response = append(response, highWaterMarkBytes...) // Log start offset (8 bytes) - 0 for simplicity response = append(response, 0, 0, 0, 0, 0, 0, 0, 0) // Aborted transactions count (4 bytes) = 0 response = append(response, 0, 0, 0, 0) } // Use the pre-fetched record batch recordBatch := result.recordBatch // Records size - flexible versions (v12+) use compact format: varint(size+1) if isFlexible { if len(recordBatch) == 0 { response = append(response, 0) // null records = 0 in compact format } else { response = append(response, EncodeUvarint(uint32(len(recordBatch)+1))...) } } else { // Non-flexible versions use int32(size) recordsSizeBytes := make([]byte, 4) binary.BigEndian.PutUint32(recordsSizeBytes, uint32(len(recordBatch))) response = append(response, recordsSizeBytes...) } // Records data response = append(response, recordBatch...) totalAppendedRecordBytes += len(recordBatch) // Tagged fields for flexible versions (v12+) after each partition if isFlexible { response = append(response, 0) // Empty tagged fields } } // Tagged fields for flexible versions (v12+) after each topic if isFlexible { response = append(response, 0) // Empty tagged fields } } // Tagged fields for flexible versions (v12+) at the end of response if isFlexible { response = append(response, 0) // Empty tagged fields } // Verify topics count hasn't been corrupted if !isFlexible { // Topics count position depends on API version: // v0: byte 0 (no throttle_time_ms, no error_code, no session_id) // v1-v6: byte 4 (after throttle_time_ms) // v7+: byte 10 (after throttle_time_ms, error_code, session_id) var topicsCountPos int if apiVersion == 0 { topicsCountPos = 0 } else if apiVersion < 7 { topicsCountPos = 4 } else { topicsCountPos = 10 } if len(response) >= topicsCountPos+4 { actualTopicsCount := binary.BigEndian.Uint32(response[topicsCountPos : topicsCountPos+4]) if actualTopicsCount != uint32(topicsCount) { glog.Errorf("FETCH CORR=%d v%d: Topics count CORRUPTED! 
Expected %d, found %d at response[%d:%d]=%02x %02x %02x %02x", correlationID, apiVersion, topicsCount, actualTopicsCount, topicsCountPos, topicsCountPos+4, response[topicsCountPos], response[topicsCountPos+1], response[topicsCountPos+2], response[topicsCountPos+3]) } } } return response, nil } // FetchRequest represents a parsed Kafka Fetch request type FetchRequest struct { ReplicaID int32 MaxWaitTime int32 MinBytes int32 MaxBytes int32 IsolationLevel int8 Topics []FetchTopic } type FetchTopic struct { Name string Partitions []FetchPartition } type FetchPartition struct { PartitionID int32 FetchOffset int64 LogStartOffset int64 MaxBytes int32 } // parseFetchRequest parses a Kafka Fetch request func (h *Handler) parseFetchRequest(apiVersion uint16, requestBody []byte) (*FetchRequest, error) { if len(requestBody) < 12 { return nil, fmt.Errorf("fetch request too short: %d bytes", len(requestBody)) } offset := 0 request := &FetchRequest{} // Check if this version uses flexible format (v12+) isFlexible := IsFlexibleVersion(1, apiVersion) // API key 1 = Fetch // NOTE: client_id is already handled by HandleConn and stripped from requestBody // Request body starts directly with fetch-specific fields // Replica ID (4 bytes) - always fixed if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for replica_id") } request.ReplicaID = int32(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 // Max wait time (4 bytes) - always fixed if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for max_wait_time") } request.MaxWaitTime = int32(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 // Min bytes (4 bytes) - always fixed if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for min_bytes") } request.MinBytes = int32(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 // Max bytes (4 bytes) - only in v3+, always fixed if apiVersion >= 3 { if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for max_bytes") } request.MaxBytes = int32(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 } // Isolation level (1 byte) - only in v4+, always fixed if apiVersion >= 4 { if offset+1 > len(requestBody) { return nil, fmt.Errorf("insufficient data for isolation_level") } request.IsolationLevel = int8(requestBody[offset]) offset += 1 } // Session ID (4 bytes) and Session Epoch (4 bytes) - only in v7+, always fixed if apiVersion >= 7 { if offset+8 > len(requestBody) { return nil, fmt.Errorf("insufficient data for session_id and epoch") } offset += 8 // Skip session_id and session_epoch } // Topics count - flexible uses compact array, non-flexible uses INT32 var topicsCount int if isFlexible { // Compact array: length+1 encoded as varint length, consumed, err := DecodeCompactArrayLength(requestBody[offset:]) if err != nil { return nil, fmt.Errorf("decode topics compact array: %w", err) } topicsCount = int(length) offset += consumed } else { // Regular array: INT32 length if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for topics count") } topicsCount = int(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 } // Parse topics request.Topics = make([]FetchTopic, topicsCount) for i := 0; i < topicsCount; i++ { // Topic name - flexible uses compact string, non-flexible uses STRING (INT16 length) var topicName string if isFlexible { // Compact string: length+1 encoded as varint name, consumed, err := 
DecodeFlexibleString(requestBody[offset:]) if err != nil { return nil, fmt.Errorf("decode topic name compact string: %w", err) } topicName = name offset += consumed } else { // Regular string: INT16 length + bytes if offset+2 > len(requestBody) { return nil, fmt.Errorf("insufficient data for topic name length") } topicNameLength := int(binary.BigEndian.Uint16(requestBody[offset : offset+2])) offset += 2 if offset+topicNameLength > len(requestBody) { return nil, fmt.Errorf("insufficient data for topic name") } topicName = string(requestBody[offset : offset+topicNameLength]) offset += topicNameLength } request.Topics[i].Name = topicName // Partitions count - flexible uses compact array, non-flexible uses INT32 var partitionsCount int if isFlexible { // Compact array: length+1 encoded as varint length, consumed, err := DecodeCompactArrayLength(requestBody[offset:]) if err != nil { return nil, fmt.Errorf("decode partitions compact array: %w", err) } partitionsCount = int(length) offset += consumed } else { // Regular array: INT32 length if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for partitions count") } partitionsCount = int(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 } // Parse partitions request.Topics[i].Partitions = make([]FetchPartition, partitionsCount) for j := 0; j < partitionsCount; j++ { // Partition ID (4 bytes) - always fixed if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for partition ID") } request.Topics[i].Partitions[j].PartitionID = int32(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 // Current leader epoch (4 bytes) - only in v9+, always fixed if apiVersion >= 9 { if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for current leader epoch") } offset += 4 // Skip current leader epoch } // Fetch offset (8 bytes) - always fixed if offset+8 > len(requestBody) { return nil, fmt.Errorf("insufficient data for fetch offset") } request.Topics[i].Partitions[j].FetchOffset = int64(binary.BigEndian.Uint64(requestBody[offset : offset+8])) offset += 8 // Log start offset (8 bytes) - only in v5+, always fixed if apiVersion >= 5 { if offset+8 > len(requestBody) { return nil, fmt.Errorf("insufficient data for log start offset") } request.Topics[i].Partitions[j].LogStartOffset = int64(binary.BigEndian.Uint64(requestBody[offset : offset+8])) offset += 8 } // Partition max bytes (4 bytes) - always fixed if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for partition max bytes") } request.Topics[i].Partitions[j].MaxBytes = int32(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 // Tagged fields for partition (only in flexible versions v12+) if isFlexible { _, consumed, err := DecodeTaggedFields(requestBody[offset:]) if err != nil { return nil, fmt.Errorf("decode partition tagged fields: %w", err) } offset += consumed } } // Tagged fields for topic (only in flexible versions v12+) if isFlexible { _, consumed, err := DecodeTaggedFields(requestBody[offset:]) if err != nil { return nil, fmt.Errorf("decode topic tagged fields: %w", err) } offset += consumed } } // Forgotten topics data (only in v7+) if apiVersion >= 7 { // Skip forgotten topics array - we don't use incremental fetch yet var forgottenTopicsCount int if isFlexible { length, consumed, err := DecodeCompactArrayLength(requestBody[offset:]) if err != nil { return nil, fmt.Errorf("decode forgotten topics compact array: %w", err) } forgottenTopicsCount = int(length) 
offset += consumed } else { if offset+4 > len(requestBody) { // End of request, no forgotten topics return request, nil } forgottenTopicsCount = int(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 } // Skip forgotten topics if present for i := 0; i < forgottenTopicsCount && offset < len(requestBody); i++ { // Skip topic name if isFlexible { _, consumed, err := DecodeFlexibleString(requestBody[offset:]) if err != nil { break } offset += consumed } else { if offset+2 > len(requestBody) { break } nameLen := int(binary.BigEndian.Uint16(requestBody[offset : offset+2])) offset += 2 + nameLen } // Skip partitions array if isFlexible { length, consumed, err := DecodeCompactArrayLength(requestBody[offset:]) if err != nil { break } offset += consumed // Skip partition IDs (4 bytes each) offset += int(length) * 4 } else { if offset+4 > len(requestBody) { break } partCount := int(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 + partCount*4 } // Skip tagged fields if flexible if isFlexible { _, consumed, err := DecodeTaggedFields(requestBody[offset:]) if err != nil { break } offset += consumed } } } // Rack ID (only in v11+) - optional string if apiVersion >= 11 && offset < len(requestBody) { if isFlexible { _, consumed, err := DecodeFlexibleString(requestBody[offset:]) if err == nil { offset += consumed } } else { if offset+2 <= len(requestBody) { rackIDLen := int(binary.BigEndian.Uint16(requestBody[offset : offset+2])) if rackIDLen >= 0 && offset+2+rackIDLen <= len(requestBody) { offset += 2 + rackIDLen } } } } // Top-level tagged fields (only in flexible versions v12+) if isFlexible && offset < len(requestBody) { _, consumed, err := DecodeTaggedFields(requestBody[offset:]) if err != nil { // Don't fail on trailing tagged fields parsing } else { offset += consumed } } return request, nil } // constructRecordBatchFromSMQ creates a Kafka record batch from SeaweedMQ records func (h *Handler) constructRecordBatchFromSMQ(topicName string, fetchOffset int64, smqRecords []integration.SMQRecord) []byte { if len(smqRecords) == 0 { return []byte{} } // Create record batch using the SMQ records batch := make([]byte, 0, 512) // Record batch header baseOffsetBytes := make([]byte, 8) binary.BigEndian.PutUint64(baseOffsetBytes, uint64(fetchOffset)) batch = append(batch, baseOffsetBytes...) // base offset (8 bytes) // Calculate batch length (will be filled after we know the size) batchLengthPos := len(batch) batch = append(batch, 0, 0, 0, 0) // batch length placeholder (4 bytes) // Partition leader epoch (4 bytes) - use 0 (real Kafka uses 0, not -1) batch = append(batch, 0x00, 0x00, 0x00, 0x00) // Magic byte (1 byte) - v2 format batch = append(batch, 2) // CRC placeholder (4 bytes) - will be calculated later crcPos := len(batch) batch = append(batch, 0, 0, 0, 0) // Attributes (2 bytes) - no compression, etc. batch = append(batch, 0, 0) // Last offset delta (4 bytes) lastOffsetDelta := int32(len(smqRecords) - 1) lastOffsetDeltaBytes := make([]byte, 4) binary.BigEndian.PutUint32(lastOffsetDeltaBytes, uint32(lastOffsetDelta)) batch = append(batch, lastOffsetDeltaBytes...) // Base timestamp (8 bytes) - convert from nanoseconds to milliseconds for Kafka compatibility baseTimestamp := smqRecords[0].GetTimestamp() / 1000000 // Convert nanoseconds to milliseconds baseTimestampBytes := make([]byte, 8) binary.BigEndian.PutUint64(baseTimestampBytes, uint64(baseTimestamp)) batch = append(batch, baseTimestampBytes...) 
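
	// For reference, the v2 record batch header assembled by this function is:
	//   baseOffset(8) batchLength(4) partitionLeaderEpoch(4) magic(1) crc(4) attributes(2)
	//   lastOffsetDelta(4) baseTimestamp(8) maxTimestamp(8) producerId(8) producerEpoch(2)
	//   baseSequence(4) recordCount(4)  => 61 header bytes, followed by the records.
	// The byte offsets cited in the CRC comments below assume this layout.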
	// Max timestamp (8 bytes) - convert from nanoseconds to milliseconds for Kafka compatibility
	maxTimestamp := baseTimestamp
	if len(smqRecords) > 1 {
		maxTimestamp = smqRecords[len(smqRecords)-1].GetTimestamp() / 1000000 // Convert nanoseconds to milliseconds
	}
	maxTimestampBytes := make([]byte, 8)
	binary.BigEndian.PutUint64(maxTimestampBytes, uint64(maxTimestamp))
	batch = append(batch, maxTimestampBytes...)

	// Producer ID (8 bytes) - use -1 for no producer ID
	batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF)

	// Producer epoch (2 bytes) - use -1 for no producer epoch
	batch = append(batch, 0xFF, 0xFF)

	// Base sequence (4 bytes) - use -1 for no base sequence
	batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF)

	// Records count (4 bytes)
	recordCountBytes := make([]byte, 4)
	binary.BigEndian.PutUint32(recordCountBytes, uint32(len(smqRecords)))
	batch = append(batch, recordCountBytes...)

	// Add individual records from SMQ records
	for i, smqRecord := range smqRecords {
		// Build individual record
		recordBytes := make([]byte, 0, 128)

		// Record attributes (1 byte)
		recordBytes = append(recordBytes, 0)

		// Timestamp delta (varint) - calculate from base timestamp (both in milliseconds)
		recordTimestampMs := smqRecord.GetTimestamp() / 1000000 // Convert nanoseconds to milliseconds
		timestampDelta := recordTimestampMs - baseTimestamp     // Both in milliseconds now
		recordBytes = append(recordBytes, encodeVarint(timestampDelta)...)

		// Offset delta (varint)
		offsetDelta := int64(i)
		recordBytes = append(recordBytes, encodeVarint(offsetDelta)...)

		// Key length and key (varint + data) - decode RecordValue to get original Kafka message
		key := h.decodeRecordValueToKafkaMessage(topicName, smqRecord.GetKey())
		if key == nil {
			recordBytes = append(recordBytes, encodeVarint(-1)...) // null key
		} else {
			recordBytes = append(recordBytes, encodeVarint(int64(len(key)))...)
			recordBytes = append(recordBytes, key...)
		}

		// Value length and value (varint + data) - decode RecordValue to get original Kafka message
		value := h.decodeRecordValueToKafkaMessage(topicName, smqRecord.GetValue())
		if value == nil {
			recordBytes = append(recordBytes, encodeVarint(-1)...) // null value
		} else {
			recordBytes = append(recordBytes, encodeVarint(int64(len(value)))...)
			recordBytes = append(recordBytes, value...)
		}

		// Headers count (varint) - 0 headers
		recordBytes = append(recordBytes, encodeVarint(0)...)

		// Prepend record length (varint)
		recordLength := int64(len(recordBytes))
		batch = append(batch, encodeVarint(recordLength)...)
		batch = append(batch, recordBytes...)
	}

	// Fill in the batch length
	batchLength := uint32(len(batch) - batchLengthPos - 4)
	binary.BigEndian.PutUint32(batch[batchLengthPos:batchLengthPos+4], batchLength)

	// Calculate CRC32 for the batch.
	// Kafka uses CRC-32C (Castagnoli), and per DefaultRecordBatch.computeChecksum the checksum
	// covers ONLY the bytes from the attributes field (byte 21) to the end of the batch; the
	// base offset, batch length, partition leader epoch, magic byte, and the CRC field itself
	// are excluded.
	crcData := batch[crcPos+4:] // skip the CRC field itself, include everything after it
	crc := crc32.Checksum(crcData, crc32.MakeTable(crc32.Castagnoli))
	binary.BigEndian.PutUint32(batch[crcPos:crcPos+4], crc)

	return batch
}

// encodeVarint encodes a signed integer using Kafka's varint encoding
func encodeVarint(value int64) []byte {
	// Kafka uses zigzag encoding for signed integers
	zigzag := uint64((value << 1) ^ (value >> 63))
	var buf []byte
	for zigzag >= 0x80 {
		buf = append(buf, byte(zigzag)|0x80)
		zigzag >>= 7
	}
	buf = append(buf, byte(zigzag))
	return buf
}

// reconstructSchematizedMessage reconstructs a schematized message from SMQ RecordValue
func (h *Handler) reconstructSchematizedMessage(recordValue *schema_pb.RecordValue, metadata map[string]string) ([]byte, error) {
	// Only reconstruct if schema management is enabled
	if !h.IsSchemaEnabled() {
		return nil, fmt.Errorf("schema management not enabled")
	}

	// Extract schema information from metadata
	schemaIDStr, exists := metadata["schema_id"]
	if !exists {
		return nil, fmt.Errorf("no schema ID in metadata")
	}

	var schemaID uint32
	if _, err := fmt.Sscanf(schemaIDStr, "%d", &schemaID); err != nil {
		return nil, fmt.Errorf("invalid schema ID: %w", err)
	}

	formatStr, exists := metadata["schema_format"]
	if !exists {
		return nil, fmt.Errorf("no schema format in metadata")
	}

	var format schema.Format
	switch formatStr {
	case "AVRO":
		format = schema.FormatAvro
	case "PROTOBUF":
		format = schema.FormatProtobuf
	case "JSON_SCHEMA":
		format = schema.FormatJSONSchema
	default:
		return nil, fmt.Errorf("unsupported schema format: %s", formatStr)
	}

	// Use schema manager to encode back to original format
	return h.schemaManager.EncodeMessage(recordValue, schemaID, format)
}

// SchematizedRecord holds both key and value for schematized messages
type SchematizedRecord struct {
	Key   []byte
	Value []byte
}

// fetchSchematizedRecords fetches and reconstructs schematized records from SeaweedMQ
func (h *Handler) fetchSchematizedRecords(topicName string, partitionID int32, offset int64, maxBytes int32) ([]*SchematizedRecord, error) {
	glog.Infof("fetchSchematizedRecords: topic=%s partition=%d offset=%d maxBytes=%d", topicName, partitionID, offset, maxBytes)

	// Only proceed when schema feature is toggled on
	if !h.useSchema {
		glog.Infof("fetchSchematizedRecords EARLY RETURN: useSchema=false")
		return []*SchematizedRecord{}, nil
	}

	// Check if SeaweedMQ handler is available when schema feature is in use
	if h.seaweedMQHandler == nil {
		glog.Infof("fetchSchematizedRecords ERROR: seaweedMQHandler is nil")
		return nil, fmt.Errorf("SeaweedMQ handler not available")
	}

	// If schema management isn't fully configured, return empty instead of error
	if !h.IsSchemaEnabled() {
		glog.Infof("fetchSchematizedRecords EARLY RETURN: IsSchemaEnabled()=false")
		return []*SchematizedRecord{}, nil
	}

	// Fetch stored records from SeaweedMQ
	maxRecords := 100 // Reasonable batch size limit
	glog.Infof("fetchSchematizedRecords: calling GetStoredRecords maxRecords=%d", maxRecords)
	smqRecords, err := h.seaweedMQHandler.GetStoredRecords(context.Background(), topicName, partitionID, offset, maxRecords)
	if err != nil {
		glog.Infof("fetchSchematizedRecords ERROR: GetStoredRecords failed: %v", err)
		return nil, fmt.Errorf("failed to fetch SMQ records: %w", err)
	}
	glog.Infof("fetchSchematizedRecords: GetStoredRecords returned %d records", len(smqRecords))

	if len(smqRecords) == 0 {
		return []*SchematizedRecord{},
nil } var reconstructedRecords []*SchematizedRecord totalBytes := int32(0) for _, smqRecord := range smqRecords { // Check if we've exceeded maxBytes limit if maxBytes > 0 && totalBytes >= maxBytes { break } // Try to reconstruct the schematized message value reconstructedValue, err := h.reconstructSchematizedMessageFromSMQ(smqRecord) if err != nil { // Log error but continue with other messages Error("Failed to reconstruct schematized message at offset %d: %v", smqRecord.GetOffset(), err) continue } if reconstructedValue != nil { // Create SchematizedRecord with both key and reconstructed value record := &SchematizedRecord{ Key: smqRecord.GetKey(), // Preserve the original key Value: reconstructedValue, // Use the reconstructed value } reconstructedRecords = append(reconstructedRecords, record) totalBytes += int32(len(record.Key) + len(record.Value)) } } return reconstructedRecords, nil } // reconstructSchematizedMessageFromSMQ reconstructs a schematized message from an SMQRecord func (h *Handler) reconstructSchematizedMessageFromSMQ(smqRecord integration.SMQRecord) ([]byte, error) { // Get the stored value (should be a serialized RecordValue) valueBytes := smqRecord.GetValue() if len(valueBytes) == 0 { return nil, fmt.Errorf("empty value in SMQ record") } // Try to unmarshal as RecordValue recordValue := &schema_pb.RecordValue{} if err := proto.Unmarshal(valueBytes, recordValue); err != nil { // If it's not a RecordValue, it might be a regular Kafka message // Return it as-is (non-schematized) return valueBytes, nil } // Extract schema metadata from the RecordValue fields metadata := h.extractSchemaMetadataFromRecord(recordValue) if len(metadata) == 0 { // No schema metadata found, treat as regular message return valueBytes, nil } // Remove Kafka metadata fields to get the original message content originalRecord := h.removeKafkaMetadataFields(recordValue) // Reconstruct the original Confluent envelope return h.reconstructSchematizedMessage(originalRecord, metadata) } // extractSchemaMetadataFromRecord extracts schema metadata from RecordValue fields func (h *Handler) extractSchemaMetadataFromRecord(recordValue *schema_pb.RecordValue) map[string]string { metadata := make(map[string]string) // Look for schema metadata fields in the record if schemaIDField := recordValue.Fields["_schema_id"]; schemaIDField != nil { if schemaIDValue := schemaIDField.GetStringValue(); schemaIDValue != "" { metadata["schema_id"] = schemaIDValue } } if schemaFormatField := recordValue.Fields["_schema_format"]; schemaFormatField != nil { if schemaFormatValue := schemaFormatField.GetStringValue(); schemaFormatValue != "" { metadata["schema_format"] = schemaFormatValue } } if schemaSubjectField := recordValue.Fields["_schema_subject"]; schemaSubjectField != nil { if schemaSubjectValue := schemaSubjectField.GetStringValue(); schemaSubjectValue != "" { metadata["schema_subject"] = schemaSubjectValue } } if schemaVersionField := recordValue.Fields["_schema_version"]; schemaVersionField != nil { if schemaVersionValue := schemaVersionField.GetStringValue(); schemaVersionValue != "" { metadata["schema_version"] = schemaVersionValue } } return metadata } // removeKafkaMetadataFields removes Kafka and schema metadata fields from RecordValue func (h *Handler) removeKafkaMetadataFields(recordValue *schema_pb.RecordValue) *schema_pb.RecordValue { originalRecord := &schema_pb.RecordValue{ Fields: make(map[string]*schema_pb.Value), } // Copy all fields except metadata fields for key, value := range recordValue.Fields { if 
!h.isMetadataField(key) { originalRecord.Fields[key] = value } } return originalRecord } // isMetadataField checks if a field is a metadata field that should be excluded from the original message func (h *Handler) isMetadataField(fieldName string) bool { return fieldName == "_kafka_offset" || fieldName == "_kafka_partition" || fieldName == "_kafka_timestamp" || fieldName == "_schema_id" || fieldName == "_schema_format" || fieldName == "_schema_subject" || fieldName == "_schema_version" } // createSchematizedRecordBatch creates a Kafka record batch from reconstructed schematized messages func (h *Handler) createSchematizedRecordBatch(records []*SchematizedRecord, baseOffset int64) []byte { if len(records) == 0 { // Return empty record batch return h.createEmptyRecordBatch(baseOffset) } // Create individual record entries for the batch var recordsData []byte currentTimestamp := time.Now().UnixMilli() for i, record := range records { // Create a record entry (Kafka record format v2) with both key and value recordEntry := h.createRecordEntry(record.Key, record.Value, int32(i), currentTimestamp) recordsData = append(recordsData, recordEntry...) } // Apply compression if the data is large enough to benefit enableCompression := len(recordsData) > 100 var compressionType compression.CompressionCodec = compression.None var finalRecordsData []byte if enableCompression { compressed, err := compression.Compress(compression.Gzip, recordsData) if err == nil && len(compressed) < len(recordsData) { finalRecordsData = compressed compressionType = compression.Gzip } else { finalRecordsData = recordsData } } else { finalRecordsData = recordsData } // Create the record batch with proper compression and CRC batch, err := h.createRecordBatchWithCompressionAndCRC(baseOffset, finalRecordsData, compressionType, int32(len(records)), currentTimestamp) if err != nil { // Fallback to simple batch creation return h.createRecordBatchWithPayload(baseOffset, int32(len(records)), finalRecordsData) } return batch } // createRecordEntry creates a single record entry in Kafka record format v2 func (h *Handler) createRecordEntry(messageKey []byte, messageData []byte, offsetDelta int32, timestamp int64) []byte { // Record format v2: // - length (varint) // - attributes (int8) // - timestamp delta (varint) // - offset delta (varint) // - key length (varint) + key // - value length (varint) + value // - headers count (varint) + headers var record []byte // Attributes (1 byte) - no special attributes record = append(record, 0) // Timestamp delta (varint) - 0 for now (all messages have same timestamp) record = append(record, encodeVarint(0)...) // Offset delta (varint) record = append(record, encodeVarint(int64(offsetDelta))...) // Key length (varint) + key if messageKey == nil || len(messageKey) == 0 { record = append(record, encodeVarint(-1)...) // -1 indicates null key } else { record = append(record, encodeVarint(int64(len(messageKey)))...) record = append(record, messageKey...) } // Value length (varint) + value record = append(record, encodeVarint(int64(len(messageData)))...) record = append(record, messageData...) // Headers count (varint) - no headers record = append(record, encodeVarint(0)...) // Prepend the total record length (varint) recordLength := encodeVarint(int64(len(record))) return append(recordLength, record...) 
} // createRecordBatchWithCompressionAndCRC creates a Kafka record batch with proper compression and CRC func (h *Handler) createRecordBatchWithCompressionAndCRC(baseOffset int64, recordsData []byte, compressionType compression.CompressionCodec, recordCount int32, baseTimestampMs int64) ([]byte, error) { // Create record batch header // Validate size to prevent overflow const maxBatchSize = 1 << 30 // 1 GB limit if len(recordsData) > maxBatchSize-61 { return nil, fmt.Errorf("records data too large: %d bytes", len(recordsData)) } batch := make([]byte, 0, len(recordsData)+61) // 61 bytes for header // Base offset (8 bytes) baseOffsetBytes := make([]byte, 8) binary.BigEndian.PutUint64(baseOffsetBytes, uint64(baseOffset)) batch = append(batch, baseOffsetBytes...) // Batch length placeholder (4 bytes) - will be filled later batchLengthPos := len(batch) batch = append(batch, 0, 0, 0, 0) // Partition leader epoch (4 bytes) batch = append(batch, 0, 0, 0, 0) // Magic byte (1 byte) - version 2 batch = append(batch, 2) // CRC placeholder (4 bytes) - will be calculated later crcPos := len(batch) batch = append(batch, 0, 0, 0, 0) // Attributes (2 bytes) - compression type and other flags attributes := int16(compressionType) // Set compression type in lower 3 bits attributesBytes := make([]byte, 2) binary.BigEndian.PutUint16(attributesBytes, uint16(attributes)) batch = append(batch, attributesBytes...) // Last offset delta (4 bytes) lastOffsetDelta := uint32(recordCount - 1) lastOffsetDeltaBytes := make([]byte, 4) binary.BigEndian.PutUint32(lastOffsetDeltaBytes, lastOffsetDelta) batch = append(batch, lastOffsetDeltaBytes...) // First timestamp (8 bytes) - use the same timestamp used to build record entries firstTimestampBytes := make([]byte, 8) binary.BigEndian.PutUint64(firstTimestampBytes, uint64(baseTimestampMs)) batch = append(batch, firstTimestampBytes...) // Max timestamp (8 bytes) - same as first for simplicity batch = append(batch, firstTimestampBytes...) // Producer ID (8 bytes) - -1 for non-transactional batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF) // Producer epoch (2 bytes) - -1 for non-transactional batch = append(batch, 0xFF, 0xFF) // Base sequence (4 bytes) - -1 for non-transactional batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF) // Record count (4 bytes) recordCountBytes := make([]byte, 4) binary.BigEndian.PutUint32(recordCountBytes, uint32(recordCount)) batch = append(batch, recordCountBytes...) // Records payload (compressed or uncompressed) batch = append(batch, recordsData...) // Calculate and set batch length (excluding base offset and batch length fields) batchLength := len(batch) - 12 // 8 bytes base offset + 4 bytes batch length binary.BigEndian.PutUint32(batch[batchLengthPos:batchLengthPos+4], uint32(batchLength)) // Calculate and set CRC32 over attributes..end (exclude CRC field itself) // Kafka uses Castagnoli (CRC-32C) algorithm. CRC covers ONLY from attributes offset (byte 21) onwards. // See: DefaultRecordBatch.java computeChecksum() - Crc32C.compute(buffer, ATTRIBUTES_OFFSET, ...) 
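	// Offset arithmetic for the 61-byte header built above: crcPos is byte 17
	// (baseOffset 8 + batchLength 4 + partitionLeaderEpoch 4 + magic 1), so crcPos+4 = 21,
	// the first byte of the attributes field.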
	crcData := batch[crcPos+4:] // Skip CRC field itself (bytes 17..20) and include the rest
	crc := crc32.Checksum(crcData, crc32.MakeTable(crc32.Castagnoli))
	binary.BigEndian.PutUint32(batch[crcPos:crcPos+4], crc)

	return batch, nil
}

// createEmptyRecordBatch creates an empty Kafka record batch using the new parser
func (h *Handler) createEmptyRecordBatch(baseOffset int64) []byte {
	// Use the new record batch creation function with no compression
	emptyRecords := []byte{}
	batch, err := CreateRecordBatch(baseOffset, emptyRecords, compression.None)
	if err != nil {
		// Fallback to manual creation if there's an error
		return h.createEmptyRecordBatchManual(baseOffset)
	}
	return batch
}

// createEmptyRecordBatchManual creates an empty Kafka record batch manually (fallback)
func (h *Handler) createEmptyRecordBatchManual(baseOffset int64) []byte {
	// Create a minimal empty record batch
	batch := make([]byte, 0, 61) // Standard record batch header size

	// Base offset (8 bytes)
	baseOffsetBytes := make([]byte, 8)
	binary.BigEndian.PutUint64(baseOffsetBytes, uint64(baseOffset))
	batch = append(batch, baseOffsetBytes...)

	// Batch length (4 bytes) - will be filled at the end
	lengthPlaceholder := len(batch)
	batch = append(batch, 0, 0, 0, 0)

	// Partition leader epoch (4 bytes) - 0 for simplicity
	batch = append(batch, 0, 0, 0, 0)

	// Magic byte (1 byte) - version 2
	batch = append(batch, 2)

	// CRC32 (4 bytes) - placeholder, calculated once the rest of the header is in place
	crcPos := len(batch)
	batch = append(batch, 0, 0, 0, 0)

	// Attributes (2 bytes) - no compression, no transactional
	batch = append(batch, 0, 0)

	// Last offset delta (4 bytes) - -1 for an empty batch (lastOffset = baseOffset - 1)
	batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF)

	// First timestamp (8 bytes) - current time
	timestamp := time.Now().UnixMilli()
	timestampBytes := make([]byte, 8)
	binary.BigEndian.PutUint64(timestampBytes, uint64(timestamp))
	batch = append(batch, timestampBytes...)

	// Max timestamp (8 bytes) - same as first for empty batch
	batch = append(batch, timestampBytes...)

	// Producer ID (8 bytes) - -1 for non-transactional
	batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF)

	// Producer Epoch (2 bytes) - -1 for non-transactional
	batch = append(batch, 0xFF, 0xFF)

	// Base Sequence (4 bytes) - -1 for non-transactional
	batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF)

	// Record count (4 bytes) - 0 for empty batch
	batch = append(batch, 0, 0, 0, 0)

	// Fill in the batch length
	batchLength := len(batch) - 12 // Exclude base offset and length field itself
	binary.BigEndian.PutUint32(batch[lengthPlaceholder:lengthPlaceholder+4], uint32(batchLength))

	// Fill in the CRC-32C (attributes..end) so clients that validate checksums accept the batch
	crc := crc32.Checksum(batch[crcPos+4:], crc32.MakeTable(crc32.Castagnoli))
	binary.BigEndian.PutUint32(batch[crcPos:crcPos+4], crc)

	return batch
}

// createRecordBatchWithPayload creates a record batch with the given payload
func (h *Handler) createRecordBatchWithPayload(baseOffset int64, recordCount int32, payload []byte) []byte {
	// For Phase 7, create a simplified record batch
	// In Phase 8, this will implement proper Kafka record batch format v2
	batch := h.createEmptyRecordBatch(baseOffset)

	// Update record count
	recordCountOffset := len(batch) - 4
	binary.BigEndian.PutUint32(batch[recordCountOffset:recordCountOffset+4], uint32(recordCount))

	// Append payload (simplified - real implementation would format individual records)
	batch = append(batch, payload...)
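	// The 12 bytes excluded below are the 8-byte base offset plus the 4-byte batch length field
	// itself; everything after those two fields counts toward the batch length.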
// Update batch length batchLength := len(batch) - 12 binary.BigEndian.PutUint32(batch[8:12], uint32(batchLength)) return batch } // handleSchematizedFetch handles fetch requests for topics with schematized messages func (h *Handler) handleSchematizedFetch(topicName string, partitionID int32, offset int64, maxBytes int32) ([]byte, error) { // Check if this topic uses schema management if !h.IsSchemaEnabled() { // Fall back to regular fetch handling return nil, fmt.Errorf("schema management not enabled") } // Fetch schematized records from SeaweedMQ records, err := h.fetchSchematizedRecords(topicName, partitionID, offset, maxBytes) if err != nil { return nil, fmt.Errorf("failed to fetch schematized records: %w", err) } // Create record batch from reconstructed records recordBatch := h.createSchematizedRecordBatch(records, offset) return recordBatch, nil } // isSchematizedTopic checks if a topic uses schema management func (h *Handler) isSchematizedTopic(topicName string) bool { // System topics (_schemas, __consumer_offsets, etc.) should NEVER use schema encoding // They have their own internal formats and should be passed through as-is if h.isSystemTopic(topicName) { return false } if !h.IsSchemaEnabled() { return false } // Check multiple indicators for schematized topics: // Check Confluent Schema Registry naming conventions return h.matchesSchemaRegistryConvention(topicName) } // matchesSchemaRegistryConvention checks Confluent Schema Registry naming patterns func (h *Handler) matchesSchemaRegistryConvention(topicName string) bool { // Common Schema Registry subject patterns: // - topicName-value (for message values) // - topicName-key (for message keys) // - topicName (direct topic name as subject) if len(topicName) > 6 && topicName[len(topicName)-6:] == "-value" { return true } if len(topicName) > 4 && topicName[len(topicName)-4:] == "-key" { return true } // Check if the topic has registered schema subjects in Schema Registry // Use standard Kafka naming convention: -value and -key if h.schemaManager != nil { // Check with -value suffix (standard pattern for value schemas) latestSchemaValue, err := h.schemaManager.GetLatestSchema(topicName + "-value") if err == nil { // Since we retrieved schema from registry, ensure topic config is updated h.ensureTopicSchemaFromLatestSchema(topicName, latestSchemaValue) return true } // Check with -key suffix (for key schemas) latestSchemaKey, err := h.schemaManager.GetLatestSchema(topicName + "-key") if err == nil { // Since we retrieved key schema from registry, ensure topic config is updated h.ensureTopicKeySchemaFromLatestSchema(topicName, latestSchemaKey) return true } } return false } // getSchemaMetadataForTopic retrieves schema metadata for a topic func (h *Handler) getSchemaMetadataForTopic(topicName string) (map[string]string, error) { if !h.IsSchemaEnabled() { return nil, fmt.Errorf("schema management not enabled") } // Try multiple approaches to get schema metadata from Schema Registry // 1. Try to get schema from registry using topic name as subject metadata, err := h.getSchemaMetadataFromRegistry(topicName) if err == nil { return metadata, nil } // 2. Try with -value suffix (common pattern) metadata, err = h.getSchemaMetadataFromRegistry(topicName + "-value") if err == nil { return metadata, nil } // 3. 
Try with -key suffix metadata, err = h.getSchemaMetadataFromRegistry(topicName + "-key") if err == nil { return metadata, nil } return nil, fmt.Errorf("no schema found in registry for topic %s (tried %s, %s-value, %s-key)", topicName, topicName, topicName, topicName) } // getSchemaMetadataFromRegistry retrieves schema metadata from Schema Registry func (h *Handler) getSchemaMetadataFromRegistry(subject string) (map[string]string, error) { if h.schemaManager == nil { return nil, fmt.Errorf("schema manager not available") } // Get latest schema for the subject cachedSchema, err := h.schemaManager.GetLatestSchema(subject) if err != nil { return nil, fmt.Errorf("failed to get schema for subject %s: %w", subject, err) } // Since we retrieved schema from registry, ensure topic config is updated // Extract topic name from subject (remove -key or -value suffix if present) topicName := h.extractTopicFromSubject(subject) if topicName != "" { h.ensureTopicSchemaFromLatestSchema(topicName, cachedSchema) } // Build metadata map // Detect format from schema content // Simple format detection - assume Avro for now format := schema.FormatAvro metadata := map[string]string{ "schema_id": fmt.Sprintf("%d", cachedSchema.LatestID), "schema_format": format.String(), "schema_subject": subject, "schema_version": fmt.Sprintf("%d", cachedSchema.Version), "schema_content": cachedSchema.Schema, } return metadata, nil } // ensureTopicSchemaFromLatestSchema ensures topic configuration is updated when latest schema is retrieved func (h *Handler) ensureTopicSchemaFromLatestSchema(topicName string, latestSchema *schema.CachedSubject) { if latestSchema == nil { return } // Convert CachedSubject to CachedSchema format for reuse // Note: CachedSubject has different field structure than expected cachedSchema := &schema.CachedSchema{ ID: latestSchema.LatestID, Schema: latestSchema.Schema, Subject: latestSchema.Subject, Version: latestSchema.Version, Format: schema.FormatAvro, // Default to Avro, could be improved with format detection CachedAt: latestSchema.CachedAt, } // Use existing function to handle the schema update h.ensureTopicSchemaFromRegistryCache(topicName, cachedSchema) } // extractTopicFromSubject extracts the topic name from a schema registry subject func (h *Handler) extractTopicFromSubject(subject string) string { // Remove common suffixes used in schema registry if strings.HasSuffix(subject, "-value") { return strings.TrimSuffix(subject, "-value") } if strings.HasSuffix(subject, "-key") { return strings.TrimSuffix(subject, "-key") } // If no suffix, assume subject name is the topic name return subject } // ensureTopicKeySchemaFromLatestSchema ensures topic configuration is updated when key schema is retrieved func (h *Handler) ensureTopicKeySchemaFromLatestSchema(topicName string, latestSchema *schema.CachedSubject) { if latestSchema == nil { return } // Convert CachedSubject to CachedSchema format for reuse // Note: CachedSubject has different field structure than expected cachedSchema := &schema.CachedSchema{ ID: latestSchema.LatestID, Schema: latestSchema.Schema, Subject: latestSchema.Subject, Version: latestSchema.Version, Format: schema.FormatAvro, // Default to Avro, could be improved with format detection CachedAt: latestSchema.CachedAt, } // Use existing function to handle the key schema update h.ensureTopicKeySchemaFromRegistryCache(topicName, cachedSchema) } // decodeRecordValueToKafkaMessage decodes a RecordValue back to the original Kafka message bytes func (h *Handler) 
decodeRecordValueToKafkaMessage(topicName string, recordValueBytes []byte) []byte {
	if recordValueBytes == nil {
		return nil
	}

	// CRITICAL FIX: For system topics like _schemas, __consumer_offsets, etc.,
	// return the raw bytes as-is. These topics store Kafka's internal format (Avro, etc.)
	// and should NOT be processed as RecordValue protobuf messages.
	if strings.HasPrefix(topicName, "_") {
		return recordValueBytes
	}

	// Try to unmarshal as RecordValue
	recordValue := &schema_pb.RecordValue{}
	if err := proto.Unmarshal(recordValueBytes, recordValue); err != nil {
		// Not a RecordValue format - this is normal for Avro/JSON/raw Kafka messages
		// Return raw bytes as-is (Kafka consumers expect this)
		return recordValueBytes
	}

	// If schema management is enabled, re-encode the RecordValue to Confluent format
	if h.IsSchemaEnabled() {
		if encodedMsg, err := h.encodeRecordValueToConfluentFormat(topicName, recordValue); err == nil {
			return encodedMsg
		}
	}

	// Fallback: convert RecordValue to JSON
	return h.recordValueToJSON(recordValue)
}

// encodeRecordValueToConfluentFormat re-encodes a RecordValue back to Confluent format
func (h *Handler) encodeRecordValueToConfluentFormat(topicName string, recordValue *schema_pb.RecordValue) ([]byte, error) {
	if recordValue == nil {
		return nil, fmt.Errorf("RecordValue is nil")
	}

	// Get schema configuration from topic config
	schemaConfig, err := h.getTopicSchemaConfig(topicName)
	if err != nil {
		return nil, fmt.Errorf("failed to get topic schema config: %w", err)
	}

	// Use schema manager to encode RecordValue back to original format
	encodedBytes, err := h.schemaManager.EncodeMessage(recordValue, schemaConfig.ValueSchemaID, schemaConfig.ValueSchemaFormat)
	if err != nil {
		return nil, fmt.Errorf("failed to encode RecordValue: %w", err)
	}

	return encodedBytes, nil
}

// getTopicSchemaConfig retrieves schema configuration for a topic
func (h *Handler) getTopicSchemaConfig(topicName string) (*TopicSchemaConfig, error) {
	h.topicSchemaConfigMu.RLock()
	defer h.topicSchemaConfigMu.RUnlock()

	if h.topicSchemaConfigs == nil {
		return nil, fmt.Errorf("no schema configuration available for topic: %s", topicName)
	}

	config, exists := h.topicSchemaConfigs[topicName]
	if !exists {
		return nil, fmt.Errorf("no schema configuration found for topic: %s", topicName)
	}

	return config, nil
}

// decodeRecordValueToKafkaKey decodes a key RecordValue back to the original Kafka key bytes
func (h *Handler) decodeRecordValueToKafkaKey(topicName string, keyRecordValueBytes []byte) []byte {
	if keyRecordValueBytes == nil {
		return nil
	}

	// Try to get topic schema config
	schemaConfig, err := h.getTopicSchemaConfig(topicName)
	if err != nil || !schemaConfig.HasKeySchema {
		// No key schema config available, return raw bytes
		return keyRecordValueBytes
	}

	// Try to unmarshal as RecordValue
	recordValue := &schema_pb.RecordValue{}
	if err := proto.Unmarshal(keyRecordValueBytes, recordValue); err != nil {
		// If it's not a RecordValue, return the raw bytes
		return keyRecordValueBytes
	}

	// If key schema management is enabled, re-encode the RecordValue to Confluent format
	if h.IsSchemaEnabled() {
		if encodedKey, err := h.encodeKeyRecordValueToConfluentFormat(topicName, recordValue); err == nil {
			return encodedKey
		}
	}

	// Fallback: convert RecordValue to JSON
	return h.recordValueToJSON(recordValue)
}

// encodeKeyRecordValueToConfluentFormat re-encodes a key RecordValue back to Confluent format
func (h *Handler) encodeKeyRecordValueToConfluentFormat(topicName string, recordValue *schema_pb.RecordValue) ([]byte, error) {
	if
recordValue == nil { return nil, fmt.Errorf("key RecordValue is nil") } // Get schema configuration from topic config schemaConfig, err := h.getTopicSchemaConfig(topicName) if err != nil { return nil, fmt.Errorf("failed to get topic schema config: %w", err) } if !schemaConfig.HasKeySchema { return nil, fmt.Errorf("no key schema configured for topic: %s", topicName) } // Use schema manager to encode RecordValue back to original format encodedBytes, err := h.schemaManager.EncodeMessage(recordValue, schemaConfig.KeySchemaID, schemaConfig.KeySchemaFormat) if err != nil { return nil, fmt.Errorf("failed to encode key RecordValue: %w", err) } return encodedBytes, nil } // recordValueToJSON converts a RecordValue to JSON bytes (fallback) func (h *Handler) recordValueToJSON(recordValue *schema_pb.RecordValue) []byte { if recordValue == nil || recordValue.Fields == nil { return []byte("{}") } // Simple JSON conversion - in a real implementation, this would be more sophisticated jsonStr := "{" first := true for fieldName, fieldValue := range recordValue.Fields { if !first { jsonStr += "," } first = false jsonStr += fmt.Sprintf(`"%s":`, fieldName) switch v := fieldValue.Kind.(type) { case *schema_pb.Value_StringValue: jsonStr += fmt.Sprintf(`"%s"`, v.StringValue) case *schema_pb.Value_BytesValue: jsonStr += fmt.Sprintf(`"%s"`, string(v.BytesValue)) case *schema_pb.Value_Int32Value: jsonStr += fmt.Sprintf(`%d`, v.Int32Value) case *schema_pb.Value_Int64Value: jsonStr += fmt.Sprintf(`%d`, v.Int64Value) case *schema_pb.Value_BoolValue: jsonStr += fmt.Sprintf(`%t`, v.BoolValue) default: jsonStr += `null` } } jsonStr += "}" return []byte(jsonStr) } // fetchPartitionData fetches data for a single partition (called concurrently) func (h *Handler) fetchPartitionData( ctx context.Context, topicName string, partition FetchPartition, apiVersion uint16, isSchematizedTopic bool, ) *partitionFetchResult { startTime := time.Now() result := &partitionFetchResult{} // Get the actual high water mark from SeaweedMQ highWaterMark, err := h.seaweedMQHandler.GetLatestOffset(topicName, partition.PartitionID) if err != nil { highWaterMark = 0 } result.highWaterMark = highWaterMark // Check if topic exists if !h.seaweedMQHandler.TopicExists(topicName) { if isSystemTopic(topicName) { // Auto-create system topics if err := h.createTopicWithSchemaSupport(topicName, 1); err != nil { result.errorCode = 3 // UNKNOWN_TOPIC_OR_PARTITION result.fetchDuration = time.Since(startTime) return result } } else { result.errorCode = 3 // UNKNOWN_TOPIC_OR_PARTITION result.fetchDuration = time.Since(startTime) return result } } // Normalize special fetch offsets effectiveFetchOffset := partition.FetchOffset if effectiveFetchOffset < 0 { if effectiveFetchOffset == -2 { effectiveFetchOffset = 0 } else if effectiveFetchOffset == -1 { effectiveFetchOffset = highWaterMark } } // Fetch records if available var recordBatch []byte if highWaterMark > effectiveFetchOffset { // Use multi-batch fetcher (pass context to respect timeout) multiFetcher := NewMultiBatchFetcher(h) fetchResult, err := multiFetcher.FetchMultipleBatches( ctx, topicName, partition.PartitionID, effectiveFetchOffset, highWaterMark, partition.MaxBytes, ) if err == nil && fetchResult.TotalSize > 0 { recordBatch = fetchResult.RecordBatches } else { // Fallback to single batch (pass context to respect timeout) smqRecords, err := h.seaweedMQHandler.GetStoredRecords(ctx, topicName, partition.PartitionID, effectiveFetchOffset, 10) if err == nil && len(smqRecords) > 0 { recordBatch = 
h.constructRecordBatchFromSMQ(topicName, effectiveFetchOffset, smqRecords) } else { recordBatch = []byte{} } } } else { recordBatch = []byte{} } // Try schematized records if needed and recordBatch is empty if isSchematizedTopic && len(recordBatch) == 0 { schematizedRecords, err := h.fetchSchematizedRecords(topicName, partition.PartitionID, effectiveFetchOffset, partition.MaxBytes) if err == nil && len(schematizedRecords) > 0 { schematizedBatch := h.createSchematizedRecordBatch(schematizedRecords, effectiveFetchOffset) if len(schematizedBatch) > 0 { recordBatch = schematizedBatch } } } result.recordBatch = recordBatch result.fetchDuration = time.Since(startTime) return result }
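
// Illustrative note (not used by any caller): encodeVarint above applies ZigZag encoding
// before emitting LEB128-style continuation bytes, which is the varint form the v2 record
// format expects for lengths and deltas. Expected outputs for that implementation:
//
//	encodeVarint(0)  -> [0x00]       // zigzag(0) = 0
//	encodeVarint(-1) -> [0x01]       // zigzag(-1) = 1; this single byte encodes a null key/value length
//	encodeVarint(1)  -> [0x02]       // zigzag(1) = 2
//	encodeVarint(64) -> [0x80, 0x01] // zigzag(64) = 128 needs a continuation byte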