package protocol import ( "context" "encoding/binary" "fmt" "hash/crc32" "strings" "time" "github.com/seaweedfs/seaweedfs/weed/glog" "github.com/seaweedfs/seaweedfs/weed/mq/kafka/compression" "github.com/seaweedfs/seaweedfs/weed/mq/kafka/integration" "github.com/seaweedfs/seaweedfs/weed/mq/kafka/schema" "github.com/seaweedfs/seaweedfs/weed/pb/schema_pb" "google.golang.org/protobuf/proto" ) // partitionFetchResult holds the result of fetching from a single partition type partitionFetchResult struct { topicIndex int partitionIndex int recordBatch []byte highWaterMark int64 errorCode int16 fetchDuration time.Duration } func (h *Handler) handleFetch(ctx context.Context, correlationID uint32, apiVersion uint16, requestBody []byte) ([]byte, error) { // Parse the Fetch request to get the requested topics and partitions fetchRequest, err := h.parseFetchRequest(apiVersion, requestBody) if err != nil { return nil, fmt.Errorf("parse fetch request: %w", err) } // Basic long-polling to avoid client busy-looping when there's no data. var throttleTimeMs int32 = 0 // Only long-poll when all referenced topics exist; unknown topics should not block allTopicsExist := func() bool { for _, topic := range fetchRequest.Topics { if !h.seaweedMQHandler.TopicExists(topic.Name) { return false } } return true } hasDataAvailable := func() bool { // Check if any requested partition has data available // Compare fetch offset with high water mark for _, topic := range fetchRequest.Topics { if !h.seaweedMQHandler.TopicExists(topic.Name) { continue } for _, partition := range topic.Partitions { hwm, err := h.seaweedMQHandler.GetLatestOffset(topic.Name, partition.PartitionID) if err != nil { continue } // Normalize fetch offset effectiveOffset := partition.FetchOffset if effectiveOffset == -2 { // earliest effectiveOffset = 0 } else if effectiveOffset == -1 { // latest effectiveOffset = hwm } // If fetch offset < hwm, data is available if effectiveOffset < hwm { return true } } } return false } // Long-poll when client requests it via MaxWaitTime and there's no data // Even if MinBytes=0, we should honor MaxWaitTime to reduce polling overhead maxWaitMs := fetchRequest.MaxWaitTime // Long-poll if: (1) client wants to wait (maxWaitMs > 0), (2) no data available, (3) topics exist // NOTE: We long-poll even if MinBytes=0, since the client specified a wait time hasData := hasDataAvailable() topicsExist := allTopicsExist() shouldLongPoll := maxWaitMs > 0 && !hasData && topicsExist if shouldLongPoll { start := time.Now() // Use the client's requested wait time (already capped at 1s) maxPollTime := time.Duration(maxWaitMs) * time.Millisecond deadline := start.Add(maxPollTime) pollLoop: for time.Now().Before(deadline) { // Use context-aware sleep instead of blocking time.Sleep select { case <-ctx.Done(): throttleTimeMs = int32(time.Since(start) / time.Millisecond) break pollLoop case <-time.After(10 * time.Millisecond): // Continue with polling } if hasDataAvailable() { break pollLoop } } elapsed := time.Since(start) throttleTimeMs = int32(elapsed / time.Millisecond) } // Build the response response := make([]byte, 0, 1024) totalAppendedRecordBytes := 0 // NOTE: Correlation ID is NOT included in the response body // The wire protocol layer (writeResponseWithTimeout) writes: [Size][CorrelationID][Body] // Kafka clients read the correlation ID separately from the 8-byte header, then read Size-4 bytes of body // If we include correlation ID here, clients will see it twice and fail with "4 extra bytes" errors // Fetch v1+ has 
throttle_time_ms at the beginning if apiVersion >= 1 { throttleBytes := make([]byte, 4) binary.BigEndian.PutUint32(throttleBytes, uint32(throttleTimeMs)) response = append(response, throttleBytes...) } // Fetch v7+ has error_code and session_id if apiVersion >= 7 { response = append(response, 0, 0) // error_code (2 bytes, 0 = no error) response = append(response, 0, 0, 0, 0) // session_id (4 bytes, 0 = no session) } // Check if this version uses flexible format (v12+) isFlexible := IsFlexibleVersion(1, apiVersion) // API key 1 = Fetch // Topics count - write the actual number of topics in the request // Kafka protocol: we MUST return all requested topics in the response (even with empty data) topicsCount := len(fetchRequest.Topics) if isFlexible { // Flexible versions use compact array format (count + 1) response = append(response, EncodeUvarint(uint32(topicsCount+1))...) } else { topicsCountBytes := make([]byte, 4) binary.BigEndian.PutUint32(topicsCountBytes, uint32(topicsCount)) response = append(response, topicsCountBytes...) } // ==================================================================== // PERSISTENT PARTITION READERS // Use per-connection persistent goroutines that maintain offset position // and stream forward, eliminating repeated lookups and reducing broker CPU // ==================================================================== // Get connection context to access persistent partition readers connContext := h.getConnectionContextFromRequest(ctx) if connContext == nil { glog.Errorf("FETCH CORR=%d: Connection context not available - cannot use persistent readers", correlationID) return nil, fmt.Errorf("connection context not available") } glog.V(2).Infof("[%s] FETCH CORR=%d: Processing %d topics with %d total partitions", connContext.ConnectionID, correlationID, len(fetchRequest.Topics), func() int { count := 0 for _, t := range fetchRequest.Topics { count += len(t.Partitions) } return count }()) // Collect results from persistent readers // CRITICAL: Dispatch all requests concurrently, then wait for all results in parallel // to avoid sequential timeout accumulation type pendingFetch struct { topicName string partitionID int32 resultChan chan *partitionFetchResult } pending := make([]pendingFetch, 0) persistentFetchStart := time.Now() // Phase 1: Dispatch all fetch requests to partition readers (non-blocking) for _, topic := range fetchRequest.Topics { isSchematizedTopic := false if h.IsSchemaEnabled() { isSchematizedTopic = h.isSchematizedTopic(topic.Name) } for _, partition := range topic.Partitions { key := TopicPartitionKey{Topic: topic.Name, Partition: partition.PartitionID} // All topics (including system topics) use persistent readers for in-memory access // This enables instant notification and avoids ForceFlush dependencies // Get or create persistent reader for this partition reader := h.getOrCreatePartitionReader(ctx, connContext, key, partition.FetchOffset) if reader == nil { // Failed to create reader - add empty pending glog.Errorf("[%s] Failed to get/create partition reader for %s[%d]", connContext.ConnectionID, topic.Name, partition.PartitionID) nilChan := make(chan *partitionFetchResult, 1) nilChan <- &partitionFetchResult{errorCode: 3} // UNKNOWN_TOPIC_OR_PARTITION pending = append(pending, pendingFetch{ topicName: topic.Name, partitionID: partition.PartitionID, resultChan: nilChan, }) continue } // Signal reader to fetch (don't wait for result yet) resultChan := make(chan *partitionFetchResult, 1) fetchReq := &partitionFetchRequest{ requestedOffset: 
partition.FetchOffset, maxBytes: partition.MaxBytes, maxWaitMs: maxWaitMs, // Pass MaxWaitTime from Kafka fetch request resultChan: resultChan, isSchematized: isSchematizedTopic, apiVersion: apiVersion, } // Try to send request (increased timeout for CI environments with slow disk I/O) select { case reader.fetchChan <- fetchReq: // Request sent successfully, add to pending pending = append(pending, pendingFetch{ topicName: topic.Name, partitionID: partition.PartitionID, resultChan: resultChan, }) case <-time.After(200 * time.Millisecond): // Channel full, return empty result glog.Warningf("[%s] Reader channel full for %s[%d], returning empty", connContext.ConnectionID, topic.Name, partition.PartitionID) emptyChan := make(chan *partitionFetchResult, 1) emptyChan <- &partitionFetchResult{} pending = append(pending, pendingFetch{ topicName: topic.Name, partitionID: partition.PartitionID, resultChan: emptyChan, }) } } } // Phase 2: Wait for all results with adequate timeout for CI environments // CRITICAL: We MUST return a result for every requested partition or Sarama will error results := make([]*partitionFetchResult, len(pending)) deadline := time.After(500 * time.Millisecond) // 500ms for all partitions (increased for CI disk I/O) // Collect results one by one with shared deadline for i, pf := range pending { select { case result := <-pf.resultChan: results[i] = result case <-deadline: // Deadline expired, return empty for this and all remaining partitions for j := i; j < len(pending); j++ { results[j] = &partitionFetchResult{} } glog.V(1).Infof("[%s] Fetch deadline expired, returning empty for %d remaining partitions", connContext.ConnectionID, len(pending)-i) goto done case <-ctx.Done(): // Context cancelled, return empty for remaining for j := i; j < len(pending); j++ { results[j] = &partitionFetchResult{} } goto done } } done: _ = time.Since(persistentFetchStart) // persistentFetchDuration // ==================================================================== // BUILD RESPONSE FROM FETCHED DATA // Now assemble the response in the correct order using fetched results // ==================================================================== // CRITICAL: Verify we have results for all requested partitions // Sarama requires a response block for EVERY requested partition to avoid ErrIncompleteResponse expectedResultCount := 0 for _, topic := range fetchRequest.Topics { expectedResultCount += len(topic.Partitions) } if len(results) != expectedResultCount { glog.Errorf("[%s] Result count mismatch: expected %d, got %d - this will cause ErrIncompleteResponse", connContext.ConnectionID, expectedResultCount, len(results)) // Pad with empty results if needed (safety net - shouldn't happen with fixed code) for len(results) < expectedResultCount { results = append(results, &partitionFetchResult{}) } } // Process each requested topic resultIdx := 0 for _, topic := range fetchRequest.Topics { topicNameBytes := []byte(topic.Name) // Topic name length and name if isFlexible { // Flexible versions use compact string format (length + 1) response = append(response, EncodeUvarint(uint32(len(topicNameBytes)+1))...) } else { response = append(response, byte(len(topicNameBytes)>>8), byte(len(topicNameBytes))) } response = append(response, topicNameBytes...) // Partitions count for this topic partitionsCount := len(topic.Partitions) if isFlexible { // Flexible versions use compact array format (count + 1) response = append(response, EncodeUvarint(uint32(partitionsCount+1))...) 
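			// KIP-482 compact arrays/strings encode length+1 as an unsigned varint so that 0 can
			// mean "null" and 1 means "empty"; e.g. a topic with 3 partitions is written here as
			// the single byte 0x04.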
} else { partitionsCountBytes := make([]byte, 4) binary.BigEndian.PutUint32(partitionsCountBytes, uint32(partitionsCount)) response = append(response, partitionsCountBytes...) } // Process each requested partition (using pre-fetched results) for _, partition := range topic.Partitions { // Get the pre-fetched result for this partition result := results[resultIdx] resultIdx++ // Partition ID partitionIDBytes := make([]byte, 4) binary.BigEndian.PutUint32(partitionIDBytes, uint32(partition.PartitionID)) response = append(response, partitionIDBytes...) // Error code (2 bytes) - use the result's error code response = append(response, byte(result.errorCode>>8), byte(result.errorCode)) // Use the pre-fetched high water mark from concurrent fetch highWaterMark := result.highWaterMark // High water mark (8 bytes) highWaterMarkBytes := make([]byte, 8) binary.BigEndian.PutUint64(highWaterMarkBytes, uint64(highWaterMark)) response = append(response, highWaterMarkBytes...) // Fetch v4+ has last_stable_offset and log_start_offset if apiVersion >= 4 { // Last stable offset (8 bytes) - same as high water mark for non-transactional response = append(response, highWaterMarkBytes...) // Log start offset (8 bytes) - 0 for simplicity response = append(response, 0, 0, 0, 0, 0, 0, 0, 0) // Aborted transactions count (4 bytes) = 0 response = append(response, 0, 0, 0, 0) } // Use the pre-fetched record batch recordBatch := result.recordBatch // Records size - flexible versions (v12+) use compact format: varint(size+1) if isFlexible { if len(recordBatch) == 0 { response = append(response, 0) // null records = 0 in compact format } else { response = append(response, EncodeUvarint(uint32(len(recordBatch)+1))...) } } else { // Non-flexible versions use int32(size) recordsSizeBytes := make([]byte, 4) binary.BigEndian.PutUint32(recordsSizeBytes, uint32(len(recordBatch))) response = append(response, recordsSizeBytes...) } // Records data response = append(response, recordBatch...) totalAppendedRecordBytes += len(recordBatch) // Tagged fields for flexible versions (v12+) after each partition if isFlexible { response = append(response, 0) // Empty tagged fields } } // Tagged fields for flexible versions (v12+) after each topic if isFlexible { response = append(response, 0) // Empty tagged fields } } // Tagged fields for flexible versions (v12+) at the end of response if isFlexible { response = append(response, 0) // Empty tagged fields } // Verify topics count hasn't been corrupted if !isFlexible { // Topics count position depends on API version: // v0: byte 0 (no throttle_time_ms, no error_code, no session_id) // v1-v6: byte 4 (after throttle_time_ms) // v7+: byte 10 (after throttle_time_ms, error_code, session_id) var topicsCountPos int if apiVersion == 0 { topicsCountPos = 0 } else if apiVersion < 7 { topicsCountPos = 4 } else { topicsCountPos = 10 } if len(response) >= topicsCountPos+4 { actualTopicsCount := binary.BigEndian.Uint32(response[topicsCountPos : topicsCountPos+4]) if actualTopicsCount != uint32(topicsCount) { glog.Errorf("FETCH CORR=%d v%d: Topics count CORRUPTED! 
Expected %d, found %d at response[%d:%d]=%02x %02x %02x %02x", correlationID, apiVersion, topicsCount, actualTopicsCount, topicsCountPos, topicsCountPos+4, response[topicsCountPos], response[topicsCountPos+1], response[topicsCountPos+2], response[topicsCountPos+3]) } } } return response, nil } // FetchRequest represents a parsed Kafka Fetch request type FetchRequest struct { ReplicaID int32 MaxWaitTime int32 MinBytes int32 MaxBytes int32 IsolationLevel int8 Topics []FetchTopic } type FetchTopic struct { Name string Partitions []FetchPartition } type FetchPartition struct { PartitionID int32 FetchOffset int64 LogStartOffset int64 MaxBytes int32 } // parseFetchRequest parses a Kafka Fetch request func (h *Handler) parseFetchRequest(apiVersion uint16, requestBody []byte) (*FetchRequest, error) { if len(requestBody) < 12 { return nil, fmt.Errorf("fetch request too short: %d bytes", len(requestBody)) } offset := 0 request := &FetchRequest{} // Check if this version uses flexible format (v12+) isFlexible := IsFlexibleVersion(1, apiVersion) // API key 1 = Fetch // NOTE: client_id is already handled by HandleConn and stripped from requestBody // Request body starts directly with fetch-specific fields // Replica ID (4 bytes) - always fixed if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for replica_id") } request.ReplicaID = int32(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 // Max wait time (4 bytes) - always fixed if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for max_wait_time") } request.MaxWaitTime = int32(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 // Min bytes (4 bytes) - always fixed if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for min_bytes") } request.MinBytes = int32(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 // Max bytes (4 bytes) - only in v3+, always fixed if apiVersion >= 3 { if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for max_bytes") } request.MaxBytes = int32(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 } // Isolation level (1 byte) - only in v4+, always fixed if apiVersion >= 4 { if offset+1 > len(requestBody) { return nil, fmt.Errorf("insufficient data for isolation_level") } request.IsolationLevel = int8(requestBody[offset]) offset += 1 } // Session ID (4 bytes) and Session Epoch (4 bytes) - only in v7+, always fixed if apiVersion >= 7 { if offset+8 > len(requestBody) { return nil, fmt.Errorf("insufficient data for session_id and epoch") } offset += 8 // Skip session_id and session_epoch } // Topics count - flexible uses compact array, non-flexible uses INT32 var topicsCount int if isFlexible { // Compact array: length+1 encoded as varint length, consumed, err := DecodeCompactArrayLength(requestBody[offset:]) if err != nil { return nil, fmt.Errorf("decode topics compact array: %w", err) } topicsCount = int(length) offset += consumed } else { // Regular array: INT32 length if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for topics count") } topicsCount = int(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 } // Parse topics request.Topics = make([]FetchTopic, topicsCount) for i := 0; i < topicsCount; i++ { // Topic name - flexible uses compact string, non-flexible uses STRING (INT16 length) var topicName string if isFlexible { // Compact string: length+1 encoded as varint name, consumed, err := 
DecodeFlexibleString(requestBody[offset:]) if err != nil { return nil, fmt.Errorf("decode topic name compact string: %w", err) } topicName = name offset += consumed } else { // Regular string: INT16 length + bytes if offset+2 > len(requestBody) { return nil, fmt.Errorf("insufficient data for topic name length") } topicNameLength := int(binary.BigEndian.Uint16(requestBody[offset : offset+2])) offset += 2 if offset+topicNameLength > len(requestBody) { return nil, fmt.Errorf("insufficient data for topic name") } topicName = string(requestBody[offset : offset+topicNameLength]) offset += topicNameLength } request.Topics[i].Name = topicName // Partitions count - flexible uses compact array, non-flexible uses INT32 var partitionsCount int if isFlexible { // Compact array: length+1 encoded as varint length, consumed, err := DecodeCompactArrayLength(requestBody[offset:]) if err != nil { return nil, fmt.Errorf("decode partitions compact array: %w", err) } partitionsCount = int(length) offset += consumed } else { // Regular array: INT32 length if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for partitions count") } partitionsCount = int(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 } // Parse partitions request.Topics[i].Partitions = make([]FetchPartition, partitionsCount) for j := 0; j < partitionsCount; j++ { // Partition ID (4 bytes) - always fixed if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for partition ID") } request.Topics[i].Partitions[j].PartitionID = int32(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 // Current leader epoch (4 bytes) - only in v9+, always fixed if apiVersion >= 9 { if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for current leader epoch") } offset += 4 // Skip current leader epoch } // Fetch offset (8 bytes) - always fixed if offset+8 > len(requestBody) { return nil, fmt.Errorf("insufficient data for fetch offset") } request.Topics[i].Partitions[j].FetchOffset = int64(binary.BigEndian.Uint64(requestBody[offset : offset+8])) offset += 8 // Log start offset (8 bytes) - only in v5+, always fixed if apiVersion >= 5 { if offset+8 > len(requestBody) { return nil, fmt.Errorf("insufficient data for log start offset") } request.Topics[i].Partitions[j].LogStartOffset = int64(binary.BigEndian.Uint64(requestBody[offset : offset+8])) offset += 8 } // Partition max bytes (4 bytes) - always fixed if offset+4 > len(requestBody) { return nil, fmt.Errorf("insufficient data for partition max bytes") } request.Topics[i].Partitions[j].MaxBytes = int32(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 // Tagged fields for partition (only in flexible versions v12+) if isFlexible { _, consumed, err := DecodeTaggedFields(requestBody[offset:]) if err != nil { return nil, fmt.Errorf("decode partition tagged fields: %w", err) } offset += consumed } } // Tagged fields for topic (only in flexible versions v12+) if isFlexible { _, consumed, err := DecodeTaggedFields(requestBody[offset:]) if err != nil { return nil, fmt.Errorf("decode topic tagged fields: %w", err) } offset += consumed } } // Forgotten topics data (only in v7+) if apiVersion >= 7 { // Skip forgotten topics array - we don't use incremental fetch yet var forgottenTopicsCount int if isFlexible { length, consumed, err := DecodeCompactArrayLength(requestBody[offset:]) if err != nil { return nil, fmt.Errorf("decode forgotten topics compact array: %w", err) } forgottenTopicsCount = int(length) 
offset += consumed } else { if offset+4 > len(requestBody) { // End of request, no forgotten topics return request, nil } forgottenTopicsCount = int(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 } // Skip forgotten topics if present for i := 0; i < forgottenTopicsCount && offset < len(requestBody); i++ { // Skip topic name if isFlexible { _, consumed, err := DecodeFlexibleString(requestBody[offset:]) if err != nil { break } offset += consumed } else { if offset+2 > len(requestBody) { break } nameLen := int(binary.BigEndian.Uint16(requestBody[offset : offset+2])) offset += 2 + nameLen } // Skip partitions array if isFlexible { length, consumed, err := DecodeCompactArrayLength(requestBody[offset:]) if err != nil { break } offset += consumed // Skip partition IDs (4 bytes each) offset += int(length) * 4 } else { if offset+4 > len(requestBody) { break } partCount := int(binary.BigEndian.Uint32(requestBody[offset : offset+4])) offset += 4 + partCount*4 } // Skip tagged fields if flexible if isFlexible { _, consumed, err := DecodeTaggedFields(requestBody[offset:]) if err != nil { break } offset += consumed } } } // Rack ID (only in v11+) - optional string if apiVersion >= 11 && offset < len(requestBody) { if isFlexible { _, consumed, err := DecodeFlexibleString(requestBody[offset:]) if err == nil { offset += consumed } } else { if offset+2 <= len(requestBody) { rackIDLen := int(binary.BigEndian.Uint16(requestBody[offset : offset+2])) if rackIDLen >= 0 && offset+2+rackIDLen <= len(requestBody) { offset += 2 + rackIDLen } } } } // Top-level tagged fields (only in flexible versions v12+) if isFlexible && offset < len(requestBody) { _, consumed, err := DecodeTaggedFields(requestBody[offset:]) if err != nil { // Don't fail on trailing tagged fields parsing } else { offset += consumed } } return request, nil } // constructRecordBatchFromSMQ creates a Kafka record batch from SeaweedMQ records func (h *Handler) constructRecordBatchFromSMQ(topicName string, fetchOffset int64, smqRecords []integration.SMQRecord) []byte { if len(smqRecords) == 0 { return []byte{} } // Create record batch using the SMQ records batch := make([]byte, 0, 512) // Record batch header baseOffsetBytes := make([]byte, 8) binary.BigEndian.PutUint64(baseOffsetBytes, uint64(fetchOffset)) batch = append(batch, baseOffsetBytes...) // base offset (8 bytes) // Calculate batch length (will be filled after we know the size) batchLengthPos := len(batch) batch = append(batch, 0, 0, 0, 0) // batch length placeholder (4 bytes) // Partition leader epoch (4 bytes) - use 0 (real Kafka uses 0, not -1) batch = append(batch, 0x00, 0x00, 0x00, 0x00) // Magic byte (1 byte) - v2 format batch = append(batch, 2) // CRC placeholder (4 bytes) - will be calculated later crcPos := len(batch) batch = append(batch, 0, 0, 0, 0) // Attributes (2 bytes) - no compression, etc. batch = append(batch, 0, 0) // Last offset delta (4 bytes) lastOffsetDelta := int32(len(smqRecords) - 1) lastOffsetDeltaBytes := make([]byte, 4) binary.BigEndian.PutUint32(lastOffsetDeltaBytes, uint32(lastOffsetDelta)) batch = append(batch, lastOffsetDeltaBytes...) // Base timestamp (8 bytes) - convert from nanoseconds to milliseconds for Kafka compatibility baseTimestamp := smqRecords[0].GetTimestamp() / 1000000 // Convert nanoseconds to milliseconds baseTimestampBytes := make([]byte, 8) binary.BigEndian.PutUint64(baseTimestampBytes, uint64(baseTimestamp)) batch = append(batch, baseTimestampBytes...) 
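
	// For reference, the v2 record batch header assembled by this function is:
	//   baseOffset(8) batchLength(4) partitionLeaderEpoch(4) magic(1) crc(4) attributes(2)
	//   lastOffsetDelta(4) baseTimestamp(8) maxTimestamp(8) producerId(8) producerEpoch(2)
	//   baseSequence(4) recordCount(4)  => 61 header bytes, followed by the records.
	// The byte offsets cited in the CRC comments below assume this layout.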
	// Max timestamp (8 bytes) - convert from nanoseconds to milliseconds for Kafka compatibility
	maxTimestamp := baseTimestamp
	if len(smqRecords) > 1 {
		maxTimestamp = smqRecords[len(smqRecords)-1].GetTimestamp() / 1000000 // Convert nanoseconds to milliseconds
	}
	maxTimestampBytes := make([]byte, 8)
	binary.BigEndian.PutUint64(maxTimestampBytes, uint64(maxTimestamp))
	batch = append(batch, maxTimestampBytes...)

	// Producer ID (8 bytes) - use -1 for no producer ID
	batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF)

	// Producer epoch (2 bytes) - use -1 for no producer epoch
	batch = append(batch, 0xFF, 0xFF)

	// Base sequence (4 bytes) - use -1 for no base sequence
	batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF)

	// Records count (4 bytes)
	recordCountBytes := make([]byte, 4)
	binary.BigEndian.PutUint32(recordCountBytes, uint32(len(smqRecords)))
	batch = append(batch, recordCountBytes...)

	// Add individual records from SMQ records
	for i, smqRecord := range smqRecords {
		// Build individual record
		recordBytes := make([]byte, 0, 128)

		// Record attributes (1 byte)
		recordBytes = append(recordBytes, 0)

		// Timestamp delta (varint) - calculate from base timestamp (both in milliseconds)
		recordTimestampMs := smqRecord.GetTimestamp() / 1000000 // Convert nanoseconds to milliseconds
		timestampDelta := recordTimestampMs - baseTimestamp     // Both in milliseconds now
		recordBytes = append(recordBytes, encodeVarint(timestampDelta)...)

		// Offset delta (varint)
		offsetDelta := int64(i)
		recordBytes = append(recordBytes, encodeVarint(offsetDelta)...)

		// Key length and key (varint + data) - decode RecordValue to get original Kafka message
		key := h.decodeRecordValueToKafkaMessage(topicName, smqRecord.GetKey())
		if key == nil {
			recordBytes = append(recordBytes, encodeVarint(-1)...) // null key
		} else {
			recordBytes = append(recordBytes, encodeVarint(int64(len(key)))...)
			recordBytes = append(recordBytes, key...)
		}

		// Value length and value (varint + data) - decode RecordValue to get original Kafka message
		value := h.decodeRecordValueToKafkaMessage(topicName, smqRecord.GetValue())
		if value == nil {
			recordBytes = append(recordBytes, encodeVarint(-1)...) // null value
		} else {
			recordBytes = append(recordBytes, encodeVarint(int64(len(value)))...)
			recordBytes = append(recordBytes, value...)
		}

		// Headers count (varint) - 0 headers
		recordBytes = append(recordBytes, encodeVarint(0)...)

		// Prepend record length (varint)
		recordLength := int64(len(recordBytes))
		batch = append(batch, encodeVarint(recordLength)...)
		batch = append(batch, recordBytes...)
	}

	// Fill in the batch length
	batchLength := uint32(len(batch) - batchLengthPos - 4)
	binary.BigEndian.PutUint32(batch[batchLengthPos:batchLengthPos+4], batchLength)

	// Calculate CRC32 for the batch.
	// Kafka uses CRC-32C (Castagnoli), and per DefaultRecordBatch.computeChecksum the checksum
	// covers ONLY the bytes from the attributes field (byte 21) to the end of the batch; the
	// base offset, batch length, partition leader epoch, magic byte, and the CRC field itself
	// are excluded.
	crcData := batch[crcPos+4:] // skip the CRC field itself, include everything after it
	crc := crc32.Checksum(crcData, crc32.MakeTable(crc32.Castagnoli))
	binary.BigEndian.PutUint32(batch[crcPos:crcPos+4], crc)

	return batch
}

// encodeVarint encodes a signed integer using Kafka's varint encoding
func encodeVarint(value int64) []byte {
	// Kafka uses zigzag encoding for signed integers
	zigzag := uint64((value << 1) ^ (value >> 63))
	var buf []byte
	for zigzag >= 0x80 {
		buf = append(buf, byte(zigzag)|0x80)
		zigzag >>= 7
	}
	buf = append(buf, byte(zigzag))
	return buf
}

// reconstructSchematizedMessage reconstructs a schematized message from SMQ RecordValue
func (h *Handler) reconstructSchematizedMessage(recordValue *schema_pb.RecordValue, metadata map[string]string) ([]byte, error) {
	// Only reconstruct if schema management is enabled
	if !h.IsSchemaEnabled() {
		return nil, fmt.Errorf("schema management not enabled")
	}

	// Extract schema information from metadata
	schemaIDStr, exists := metadata["schema_id"]
	if !exists {
		return nil, fmt.Errorf("no schema ID in metadata")
	}

	var schemaID uint32
	if _, err := fmt.Sscanf(schemaIDStr, "%d", &schemaID); err != nil {
		return nil, fmt.Errorf("invalid schema ID: %w", err)
	}

	formatStr, exists := metadata["schema_format"]
	if !exists {
		return nil, fmt.Errorf("no schema format in metadata")
	}

	var format schema.Format
	switch formatStr {
	case "AVRO":
		format = schema.FormatAvro
	case "PROTOBUF":
		format = schema.FormatProtobuf
	case "JSON_SCHEMA":
		format = schema.FormatJSONSchema
	default:
		return nil, fmt.Errorf("unsupported schema format: %s", formatStr)
	}

	// Use schema manager to encode back to original format
	return h.schemaManager.EncodeMessage(recordValue, schemaID, format)
}

// SchematizedRecord holds both key and value for schematized messages
type SchematizedRecord struct {
	Key   []byte
	Value []byte
}

// fetchSchematizedRecords fetches and reconstructs schematized records from SeaweedMQ
func (h *Handler) fetchSchematizedRecords(topicName string, partitionID int32, offset int64, maxBytes int32) ([]*SchematizedRecord, error) {
	glog.Infof("fetchSchematizedRecords: topic=%s partition=%d offset=%d maxBytes=%d", topicName, partitionID, offset, maxBytes)

	// Only proceed when schema feature is toggled on
	if !h.useSchema {
		glog.Infof("fetchSchematizedRecords EARLY RETURN: useSchema=false")
		return []*SchematizedRecord{}, nil
	}

	// Check if SeaweedMQ handler is available when schema feature is in use
	if h.seaweedMQHandler == nil {
		glog.Infof("fetchSchematizedRecords ERROR: seaweedMQHandler is nil")
		return nil, fmt.Errorf("SeaweedMQ handler not available")
	}

	// If schema management isn't fully configured, return empty instead of error
	if !h.IsSchemaEnabled() {
		glog.Infof("fetchSchematizedRecords EARLY RETURN: IsSchemaEnabled()=false")
		return []*SchematizedRecord{}, nil
	}

	// Fetch stored records from SeaweedMQ
	maxRecords := 100 // Reasonable batch size limit
	glog.Infof("fetchSchematizedRecords: calling GetStoredRecords maxRecords=%d", maxRecords)
	smqRecords, err := h.seaweedMQHandler.GetStoredRecords(context.Background(), topicName, partitionID, offset, maxRecords)
	if err != nil {
		glog.Infof("fetchSchematizedRecords ERROR: GetStoredRecords failed: %v", err)
		return nil, fmt.Errorf("failed to fetch SMQ records: %w", err)
	}
	glog.Infof("fetchSchematizedRecords: GetStoredRecords returned %d records", len(smqRecords))

	if len(smqRecords) == 0 {
		return []*SchematizedRecord{},
nil } var reconstructedRecords []*SchematizedRecord totalBytes := int32(0) for _, smqRecord := range smqRecords { // Check if we've exceeded maxBytes limit if maxBytes > 0 && totalBytes >= maxBytes { break } // Try to reconstruct the schematized message value reconstructedValue, err := h.reconstructSchematizedMessageFromSMQ(smqRecord) if err != nil { // Log error but continue with other messages Error("Failed to reconstruct schematized message at offset %d: %v", smqRecord.GetOffset(), err) continue } if reconstructedValue != nil { // Create SchematizedRecord with both key and reconstructed value record := &SchematizedRecord{ Key: smqRecord.GetKey(), // Preserve the original key Value: reconstructedValue, // Use the reconstructed value } reconstructedRecords = append(reconstructedRecords, record) totalBytes += int32(len(record.Key) + len(record.Value)) } } return reconstructedRecords, nil } // reconstructSchematizedMessageFromSMQ reconstructs a schematized message from an SMQRecord func (h *Handler) reconstructSchematizedMessageFromSMQ(smqRecord integration.SMQRecord) ([]byte, error) { // Get the stored value (should be a serialized RecordValue) valueBytes := smqRecord.GetValue() if len(valueBytes) == 0 { return nil, fmt.Errorf("empty value in SMQ record") } // Try to unmarshal as RecordValue recordValue := &schema_pb.RecordValue{} if err := proto.Unmarshal(valueBytes, recordValue); err != nil { // If it's not a RecordValue, it might be a regular Kafka message // Return it as-is (non-schematized) return valueBytes, nil } // Extract schema metadata from the RecordValue fields metadata := h.extractSchemaMetadataFromRecord(recordValue) if len(metadata) == 0 { // No schema metadata found, treat as regular message return valueBytes, nil } // Remove Kafka metadata fields to get the original message content originalRecord := h.removeKafkaMetadataFields(recordValue) // Reconstruct the original Confluent envelope return h.reconstructSchematizedMessage(originalRecord, metadata) } // extractSchemaMetadataFromRecord extracts schema metadata from RecordValue fields func (h *Handler) extractSchemaMetadataFromRecord(recordValue *schema_pb.RecordValue) map[string]string { metadata := make(map[string]string) // Look for schema metadata fields in the record if schemaIDField := recordValue.Fields["_schema_id"]; schemaIDField != nil { if schemaIDValue := schemaIDField.GetStringValue(); schemaIDValue != "" { metadata["schema_id"] = schemaIDValue } } if schemaFormatField := recordValue.Fields["_schema_format"]; schemaFormatField != nil { if schemaFormatValue := schemaFormatField.GetStringValue(); schemaFormatValue != "" { metadata["schema_format"] = schemaFormatValue } } if schemaSubjectField := recordValue.Fields["_schema_subject"]; schemaSubjectField != nil { if schemaSubjectValue := schemaSubjectField.GetStringValue(); schemaSubjectValue != "" { metadata["schema_subject"] = schemaSubjectValue } } if schemaVersionField := recordValue.Fields["_schema_version"]; schemaVersionField != nil { if schemaVersionValue := schemaVersionField.GetStringValue(); schemaVersionValue != "" { metadata["schema_version"] = schemaVersionValue } } return metadata } // removeKafkaMetadataFields removes Kafka and schema metadata fields from RecordValue func (h *Handler) removeKafkaMetadataFields(recordValue *schema_pb.RecordValue) *schema_pb.RecordValue { originalRecord := &schema_pb.RecordValue{ Fields: make(map[string]*schema_pb.Value), } // Copy all fields except metadata fields for key, value := range recordValue.Fields { if 
!h.isMetadataField(key) { originalRecord.Fields[key] = value } } return originalRecord } // isMetadataField checks if a field is a metadata field that should be excluded from the original message func (h *Handler) isMetadataField(fieldName string) bool { return fieldName == "_kafka_offset" || fieldName == "_kafka_partition" || fieldName == "_kafka_timestamp" || fieldName == "_schema_id" || fieldName == "_schema_format" || fieldName == "_schema_subject" || fieldName == "_schema_version" } // createSchematizedRecordBatch creates a Kafka record batch from reconstructed schematized messages func (h *Handler) createSchematizedRecordBatch(records []*SchematizedRecord, baseOffset int64) []byte { if len(records) == 0 { // Return empty record batch return h.createEmptyRecordBatch(baseOffset) } // Create individual record entries for the batch var recordsData []byte currentTimestamp := time.Now().UnixMilli() for i, record := range records { // Create a record entry (Kafka record format v2) with both key and value recordEntry := h.createRecordEntry(record.Key, record.Value, int32(i), currentTimestamp) recordsData = append(recordsData, recordEntry...) } // Apply compression if the data is large enough to benefit enableCompression := len(recordsData) > 100 var compressionType compression.CompressionCodec = compression.None var finalRecordsData []byte if enableCompression { compressed, err := compression.Compress(compression.Gzip, recordsData) if err == nil && len(compressed) < len(recordsData) { finalRecordsData = compressed compressionType = compression.Gzip } else { finalRecordsData = recordsData } } else { finalRecordsData = recordsData } // Create the record batch with proper compression and CRC batch, err := h.createRecordBatchWithCompressionAndCRC(baseOffset, finalRecordsData, compressionType, int32(len(records)), currentTimestamp) if err != nil { // Fallback to simple batch creation return h.createRecordBatchWithPayload(baseOffset, int32(len(records)), finalRecordsData) } return batch } // createRecordEntry creates a single record entry in Kafka record format v2 func (h *Handler) createRecordEntry(messageKey []byte, messageData []byte, offsetDelta int32, timestamp int64) []byte { // Record format v2: // - length (varint) // - attributes (int8) // - timestamp delta (varint) // - offset delta (varint) // - key length (varint) + key // - value length (varint) + value // - headers count (varint) + headers var record []byte // Attributes (1 byte) - no special attributes record = append(record, 0) // Timestamp delta (varint) - 0 for now (all messages have same timestamp) record = append(record, encodeVarint(0)...) // Offset delta (varint) record = append(record, encodeVarint(int64(offsetDelta))...) // Key length (varint) + key if messageKey == nil || len(messageKey) == 0 { record = append(record, encodeVarint(-1)...) // -1 indicates null key } else { record = append(record, encodeVarint(int64(len(messageKey)))...) record = append(record, messageKey...) } // Value length (varint) + value record = append(record, encodeVarint(int64(len(messageData)))...) record = append(record, messageData...) // Headers count (varint) - no headers record = append(record, encodeVarint(0)...) // Prepend the total record length (varint) recordLength := encodeVarint(int64(len(record))) return append(recordLength, record...) 
} // createRecordBatchWithCompressionAndCRC creates a Kafka record batch with proper compression and CRC func (h *Handler) createRecordBatchWithCompressionAndCRC(baseOffset int64, recordsData []byte, compressionType compression.CompressionCodec, recordCount int32, baseTimestampMs int64) ([]byte, error) { // Create record batch header // Validate size to prevent overflow const maxBatchSize = 1 << 30 // 1 GB limit if len(recordsData) > maxBatchSize-61 { return nil, fmt.Errorf("records data too large: %d bytes", len(recordsData)) } batch := make([]byte, 0, len(recordsData)+61) // 61 bytes for header // Base offset (8 bytes) baseOffsetBytes := make([]byte, 8) binary.BigEndian.PutUint64(baseOffsetBytes, uint64(baseOffset)) batch = append(batch, baseOffsetBytes...) // Batch length placeholder (4 bytes) - will be filled later batchLengthPos := len(batch) batch = append(batch, 0, 0, 0, 0) // Partition leader epoch (4 bytes) batch = append(batch, 0, 0, 0, 0) // Magic byte (1 byte) - version 2 batch = append(batch, 2) // CRC placeholder (4 bytes) - will be calculated later crcPos := len(batch) batch = append(batch, 0, 0, 0, 0) // Attributes (2 bytes) - compression type and other flags attributes := int16(compressionType) // Set compression type in lower 3 bits attributesBytes := make([]byte, 2) binary.BigEndian.PutUint16(attributesBytes, uint16(attributes)) batch = append(batch, attributesBytes...) // Last offset delta (4 bytes) lastOffsetDelta := uint32(recordCount - 1) lastOffsetDeltaBytes := make([]byte, 4) binary.BigEndian.PutUint32(lastOffsetDeltaBytes, lastOffsetDelta) batch = append(batch, lastOffsetDeltaBytes...) // First timestamp (8 bytes) - use the same timestamp used to build record entries firstTimestampBytes := make([]byte, 8) binary.BigEndian.PutUint64(firstTimestampBytes, uint64(baseTimestampMs)) batch = append(batch, firstTimestampBytes...) // Max timestamp (8 bytes) - same as first for simplicity batch = append(batch, firstTimestampBytes...) // Producer ID (8 bytes) - -1 for non-transactional batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF) // Producer epoch (2 bytes) - -1 for non-transactional batch = append(batch, 0xFF, 0xFF) // Base sequence (4 bytes) - -1 for non-transactional batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF) // Record count (4 bytes) recordCountBytes := make([]byte, 4) binary.BigEndian.PutUint32(recordCountBytes, uint32(recordCount)) batch = append(batch, recordCountBytes...) // Records payload (compressed or uncompressed) batch = append(batch, recordsData...) // Calculate and set batch length (excluding base offset and batch length fields) batchLength := len(batch) - 12 // 8 bytes base offset + 4 bytes batch length binary.BigEndian.PutUint32(batch[batchLengthPos:batchLengthPos+4], uint32(batchLength)) // Calculate and set CRC32 over attributes..end (exclude CRC field itself) // Kafka uses Castagnoli (CRC-32C) algorithm. CRC covers ONLY from attributes offset (byte 21) onwards. // See: DefaultRecordBatch.java computeChecksum() - Crc32C.compute(buffer, ATTRIBUTES_OFFSET, ...) 
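	// Offset arithmetic for the 61-byte header built above: crcPos is byte 17
	// (baseOffset 8 + batchLength 4 + partitionLeaderEpoch 4 + magic 1), so crcPos+4 = 21,
	// the first byte of the attributes field.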
	crcData := batch[crcPos+4:] // Skip CRC field itself (bytes 17..20) and include the rest
	crc := crc32.Checksum(crcData, crc32.MakeTable(crc32.Castagnoli))
	binary.BigEndian.PutUint32(batch[crcPos:crcPos+4], crc)

	return batch, nil
}

// createEmptyRecordBatch creates an empty Kafka record batch using the new parser
func (h *Handler) createEmptyRecordBatch(baseOffset int64) []byte {
	// Use the new record batch creation function with no compression
	emptyRecords := []byte{}
	batch, err := CreateRecordBatch(baseOffset, emptyRecords, compression.None)
	if err != nil {
		// Fallback to manual creation if there's an error
		return h.createEmptyRecordBatchManual(baseOffset)
	}
	return batch
}

// createEmptyRecordBatchManual creates an empty Kafka record batch manually (fallback)
func (h *Handler) createEmptyRecordBatchManual(baseOffset int64) []byte {
	// Create a minimal empty record batch
	batch := make([]byte, 0, 61) // Standard record batch header size

	// Base offset (8 bytes)
	baseOffsetBytes := make([]byte, 8)
	binary.BigEndian.PutUint64(baseOffsetBytes, uint64(baseOffset))
	batch = append(batch, baseOffsetBytes...)

	// Batch length (4 bytes) - will be filled at the end
	lengthPlaceholder := len(batch)
	batch = append(batch, 0, 0, 0, 0)

	// Partition leader epoch (4 bytes) - 0 for simplicity
	batch = append(batch, 0, 0, 0, 0)

	// Magic byte (1 byte) - version 2
	batch = append(batch, 2)

	// CRC32 (4 bytes) - placeholder, calculated once the rest of the header is in place
	crcPos := len(batch)
	batch = append(batch, 0, 0, 0, 0)

	// Attributes (2 bytes) - no compression, no transactional
	batch = append(batch, 0, 0)

	// Last offset delta (4 bytes) - -1 for an empty batch (lastOffset = baseOffset - 1)
	batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF)

	// First timestamp (8 bytes) - current time
	timestamp := time.Now().UnixMilli()
	timestampBytes := make([]byte, 8)
	binary.BigEndian.PutUint64(timestampBytes, uint64(timestamp))
	batch = append(batch, timestampBytes...)

	// Max timestamp (8 bytes) - same as first for empty batch
	batch = append(batch, timestampBytes...)

	// Producer ID (8 bytes) - -1 for non-transactional
	batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF)

	// Producer Epoch (2 bytes) - -1 for non-transactional
	batch = append(batch, 0xFF, 0xFF)

	// Base Sequence (4 bytes) - -1 for non-transactional
	batch = append(batch, 0xFF, 0xFF, 0xFF, 0xFF)

	// Record count (4 bytes) - 0 for empty batch
	batch = append(batch, 0, 0, 0, 0)

	// Fill in the batch length
	batchLength := len(batch) - 12 // Exclude base offset and length field itself
	binary.BigEndian.PutUint32(batch[lengthPlaceholder:lengthPlaceholder+4], uint32(batchLength))

	// Fill in the CRC-32C (attributes..end) so clients that validate checksums accept the batch
	crc := crc32.Checksum(batch[crcPos+4:], crc32.MakeTable(crc32.Castagnoli))
	binary.BigEndian.PutUint32(batch[crcPos:crcPos+4], crc)

	return batch
}

// createRecordBatchWithPayload creates a record batch with the given payload
func (h *Handler) createRecordBatchWithPayload(baseOffset int64, recordCount int32, payload []byte) []byte {
	// For Phase 7, create a simplified record batch
	// In Phase 8, this will implement proper Kafka record batch format v2
	batch := h.createEmptyRecordBatch(baseOffset)

	// Update record count
	recordCountOffset := len(batch) - 4
	binary.BigEndian.PutUint32(batch[recordCountOffset:recordCountOffset+4], uint32(recordCount))

	// Append payload (simplified - real implementation would format individual records)
	batch = append(batch, payload...)
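	// The 12 bytes excluded below are the 8-byte base offset plus the 4-byte batch length field
	// itself; everything after those two fields counts toward the batch length.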
// Update batch length batchLength := len(batch) - 12 binary.BigEndian.PutUint32(batch[8:12], uint32(batchLength)) return batch } // handleSchematizedFetch handles fetch requests for topics with schematized messages func (h *Handler) handleSchematizedFetch(topicName string, partitionID int32, offset int64, maxBytes int32) ([]byte, error) { // Check if this topic uses schema management if !h.IsSchemaEnabled() { // Fall back to regular fetch handling return nil, fmt.Errorf("schema management not enabled") } // Fetch schematized records from SeaweedMQ records, err := h.fetchSchematizedRecords(topicName, partitionID, offset, maxBytes) if err != nil { return nil, fmt.Errorf("failed to fetch schematized records: %w", err) } // Create record batch from reconstructed records recordBatch := h.createSchematizedRecordBatch(records, offset) return recordBatch, nil } // isSchematizedTopic checks if a topic uses schema management func (h *Handler) isSchematizedTopic(topicName string) bool { // System topics (_schemas, __consumer_offsets, etc.) should NEVER use schema encoding // They have their own internal formats and should be passed through as-is if h.isSystemTopic(topicName) { return false } if !h.IsSchemaEnabled() { return false } // Check multiple indicators for schematized topics: // Check Confluent Schema Registry naming conventions return h.matchesSchemaRegistryConvention(topicName) } // matchesSchemaRegistryConvention checks Confluent Schema Registry naming patterns func (h *Handler) matchesSchemaRegistryConvention(topicName string) bool { // Common Schema Registry subject patterns: // - topicName-value (for message values) // - topicName-key (for message keys) // - topicName (direct topic name as subject) if len(topicName) > 6 && topicName[len(topicName)-6:] == "-value" { return true } if len(topicName) > 4 && topicName[len(topicName)-4:] == "-key" { return true } // Check if the topic has registered schema subjects in Schema Registry // Use standard Kafka naming convention: -value and -key if h.schemaManager != nil { // Check with -value suffix (standard pattern for value schemas) latestSchemaValue, err := h.schemaManager.GetLatestSchema(topicName + "-value") if err == nil { // Since we retrieved schema from registry, ensure topic config is updated h.ensureTopicSchemaFromLatestSchema(topicName, latestSchemaValue) return true } // Check with -key suffix (for key schemas) latestSchemaKey, err := h.schemaManager.GetLatestSchema(topicName + "-key") if err == nil { // Since we retrieved key schema from registry, ensure topic config is updated h.ensureTopicKeySchemaFromLatestSchema(topicName, latestSchemaKey) return true } } return false } // getSchemaMetadataForTopic retrieves schema metadata for a topic func (h *Handler) getSchemaMetadataForTopic(topicName string) (map[string]string, error) { if !h.IsSchemaEnabled() { return nil, fmt.Errorf("schema management not enabled") } // Try multiple approaches to get schema metadata from Schema Registry // 1. Try to get schema from registry using topic name as subject metadata, err := h.getSchemaMetadataFromRegistry(topicName) if err == nil { return metadata, nil } // 2. Try with -value suffix (common pattern) metadata, err = h.getSchemaMetadataFromRegistry(topicName + "-value") if err == nil { return metadata, nil } // 3. 
Try with -key suffix metadata, err = h.getSchemaMetadataFromRegistry(topicName + "-key") if err == nil { return metadata, nil } return nil, fmt.Errorf("no schema found in registry for topic %s (tried %s, %s-value, %s-key)", topicName, topicName, topicName, topicName) } // getSchemaMetadataFromRegistry retrieves schema metadata from Schema Registry func (h *Handler) getSchemaMetadataFromRegistry(subject string) (map[string]string, error) { if h.schemaManager == nil { return nil, fmt.Errorf("schema manager not available") } // Get latest schema for the subject cachedSchema, err := h.schemaManager.GetLatestSchema(subject) if err != nil { return nil, fmt.Errorf("failed to get schema for subject %s: %w", subject, err) } // Since we retrieved schema from registry, ensure topic config is updated // Extract topic name from subject (remove -key or -value suffix if present) topicName := h.extractTopicFromSubject(subject) if topicName != "" { h.ensureTopicSchemaFromLatestSchema(topicName, cachedSchema) } // Build metadata map // Detect format from schema content // Simple format detection - assume Avro for now format := schema.FormatAvro metadata := map[string]string{ "schema_id": fmt.Sprintf("%d", cachedSchema.LatestID), "schema_format": format.String(), "schema_subject": subject, "schema_version": fmt.Sprintf("%d", cachedSchema.Version), "schema_content": cachedSchema.Schema, } return metadata, nil } // ensureTopicSchemaFromLatestSchema ensures topic configuration is updated when latest schema is retrieved func (h *Handler) ensureTopicSchemaFromLatestSchema(topicName string, latestSchema *schema.CachedSubject) { if latestSchema == nil { return } // Convert CachedSubject to CachedSchema format for reuse // Note: CachedSubject has different field structure than expected cachedSchema := &schema.CachedSchema{ ID: latestSchema.LatestID, Schema: latestSchema.Schema, Subject: latestSchema.Subject, Version: latestSchema.Version, Format: schema.FormatAvro, // Default to Avro, could be improved with format detection CachedAt: latestSchema.CachedAt, } // Use existing function to handle the schema update h.ensureTopicSchemaFromRegistryCache(topicName, cachedSchema) } // extractTopicFromSubject extracts the topic name from a schema registry subject func (h *Handler) extractTopicFromSubject(subject string) string { // Remove common suffixes used in schema registry if strings.HasSuffix(subject, "-value") { return strings.TrimSuffix(subject, "-value") } if strings.HasSuffix(subject, "-key") { return strings.TrimSuffix(subject, "-key") } // If no suffix, assume subject name is the topic name return subject } // ensureTopicKeySchemaFromLatestSchema ensures topic configuration is updated when key schema is retrieved func (h *Handler) ensureTopicKeySchemaFromLatestSchema(topicName string, latestSchema *schema.CachedSubject) { if latestSchema == nil { return } // Convert CachedSubject to CachedSchema format for reuse // Note: CachedSubject has different field structure than expected cachedSchema := &schema.CachedSchema{ ID: latestSchema.LatestID, Schema: latestSchema.Schema, Subject: latestSchema.Subject, Version: latestSchema.Version, Format: schema.FormatAvro, // Default to Avro, could be improved with format detection CachedAt: latestSchema.CachedAt, } // Use existing function to handle the key schema update h.ensureTopicKeySchemaFromRegistryCache(topicName, cachedSchema) } // decodeRecordValueToKafkaMessage decodes a RecordValue back to the original Kafka message bytes func (h *Handler) 
decodeRecordValueToKafkaMessage(topicName string, recordValueBytes []byte) []byte {
	if recordValueBytes == nil {
		return nil
	}

	// CRITICAL FIX: For system topics like _schemas, __consumer_offsets, etc.,
	// return the raw bytes as-is. These topics store Kafka's internal format (Avro, etc.)
	// and should NOT be processed as RecordValue protobuf messages.
	if strings.HasPrefix(topicName, "_") {
		return recordValueBytes
	}

	// Try to unmarshal as RecordValue
	recordValue := &schema_pb.RecordValue{}
	if err := proto.Unmarshal(recordValueBytes, recordValue); err != nil {
		// Not a RecordValue format - this is normal for Avro/JSON/raw Kafka messages
		// Return raw bytes as-is (Kafka consumers expect this)
		return recordValueBytes
	}

	// If schema management is enabled, re-encode the RecordValue to Confluent format
	if h.IsSchemaEnabled() {
		if encodedMsg, err := h.encodeRecordValueToConfluentFormat(topicName, recordValue); err == nil {
			return encodedMsg
		}
	}

	// Fallback: convert RecordValue to JSON
	return h.recordValueToJSON(recordValue)
}

// encodeRecordValueToConfluentFormat re-encodes a RecordValue back to Confluent format
func (h *Handler) encodeRecordValueToConfluentFormat(topicName string, recordValue *schema_pb.RecordValue) ([]byte, error) {
	if recordValue == nil {
		return nil, fmt.Errorf("RecordValue is nil")
	}

	// Get schema configuration from topic config
	schemaConfig, err := h.getTopicSchemaConfig(topicName)
	if err != nil {
		return nil, fmt.Errorf("failed to get topic schema config: %w", err)
	}

	// Use schema manager to encode RecordValue back to original format
	encodedBytes, err := h.schemaManager.EncodeMessage(recordValue, schemaConfig.ValueSchemaID, schemaConfig.ValueSchemaFormat)
	if err != nil {
		return nil, fmt.Errorf("failed to encode RecordValue: %w", err)
	}

	return encodedBytes, nil
}

// getTopicSchemaConfig retrieves schema configuration for a topic
func (h *Handler) getTopicSchemaConfig(topicName string) (*TopicSchemaConfig, error) {
	h.topicSchemaConfigMu.RLock()
	defer h.topicSchemaConfigMu.RUnlock()

	if h.topicSchemaConfigs == nil {
		return nil, fmt.Errorf("no schema configuration available for topic: %s", topicName)
	}

	config, exists := h.topicSchemaConfigs[topicName]
	if !exists {
		return nil, fmt.Errorf("no schema configuration found for topic: %s", topicName)
	}

	return config, nil
}

// decodeRecordValueToKafkaKey decodes a key RecordValue back to the original Kafka key bytes
func (h *Handler) decodeRecordValueToKafkaKey(topicName string, keyRecordValueBytes []byte) []byte {
	if keyRecordValueBytes == nil {
		return nil
	}

	// Try to get topic schema config
	schemaConfig, err := h.getTopicSchemaConfig(topicName)
	if err != nil || !schemaConfig.HasKeySchema {
		// No key schema config available, return raw bytes
		return keyRecordValueBytes
	}

	// Try to unmarshal as RecordValue
	recordValue := &schema_pb.RecordValue{}
	if err := proto.Unmarshal(keyRecordValueBytes, recordValue); err != nil {
		// If it's not a RecordValue, return the raw bytes
		return keyRecordValueBytes
	}

	// If key schema management is enabled, re-encode the RecordValue to Confluent format
	if h.IsSchemaEnabled() {
		if encodedKey, err := h.encodeKeyRecordValueToConfluentFormat(topicName, recordValue); err == nil {
			return encodedKey
		}
	}

	// Fallback: convert RecordValue to JSON
	return h.recordValueToJSON(recordValue)
}

// encodeKeyRecordValueToConfluentFormat re-encodes a key RecordValue back to Confluent format
func (h *Handler) encodeKeyRecordValueToConfluentFormat(topicName string, recordValue *schema_pb.RecordValue) ([]byte, error) {
	if
recordValue == nil { return nil, fmt.Errorf("key RecordValue is nil") } // Get schema configuration from topic config schemaConfig, err := h.getTopicSchemaConfig(topicName) if err != nil { return nil, fmt.Errorf("failed to get topic schema config: %w", err) } if !schemaConfig.HasKeySchema { return nil, fmt.Errorf("no key schema configured for topic: %s", topicName) } // Use schema manager to encode RecordValue back to original format encodedBytes, err := h.schemaManager.EncodeMessage(recordValue, schemaConfig.KeySchemaID, schemaConfig.KeySchemaFormat) if err != nil { return nil, fmt.Errorf("failed to encode key RecordValue: %w", err) } return encodedBytes, nil } // recordValueToJSON converts a RecordValue to JSON bytes (fallback) func (h *Handler) recordValueToJSON(recordValue *schema_pb.RecordValue) []byte { if recordValue == nil || recordValue.Fields == nil { return []byte("{}") } // Simple JSON conversion - in a real implementation, this would be more sophisticated jsonStr := "{" first := true for fieldName, fieldValue := range recordValue.Fields { if !first { jsonStr += "," } first = false jsonStr += fmt.Sprintf(`"%s":`, fieldName) switch v := fieldValue.Kind.(type) { case *schema_pb.Value_StringValue: jsonStr += fmt.Sprintf(`"%s"`, v.StringValue) case *schema_pb.Value_BytesValue: jsonStr += fmt.Sprintf(`"%s"`, string(v.BytesValue)) case *schema_pb.Value_Int32Value: jsonStr += fmt.Sprintf(`%d`, v.Int32Value) case *schema_pb.Value_Int64Value: jsonStr += fmt.Sprintf(`%d`, v.Int64Value) case *schema_pb.Value_BoolValue: jsonStr += fmt.Sprintf(`%t`, v.BoolValue) default: jsonStr += `null` } } jsonStr += "}" return []byte(jsonStr) } // fetchPartitionData fetches data for a single partition (called concurrently) func (h *Handler) fetchPartitionData( ctx context.Context, topicName string, partition FetchPartition, apiVersion uint16, isSchematizedTopic bool, ) *partitionFetchResult { startTime := time.Now() result := &partitionFetchResult{} // Get the actual high water mark from SeaweedMQ highWaterMark, err := h.seaweedMQHandler.GetLatestOffset(topicName, partition.PartitionID) if err != nil { highWaterMark = 0 } result.highWaterMark = highWaterMark // Check if topic exists if !h.seaweedMQHandler.TopicExists(topicName) { if isSystemTopic(topicName) { // Auto-create system topics if err := h.createTopicWithSchemaSupport(topicName, 1); err != nil { result.errorCode = 3 // UNKNOWN_TOPIC_OR_PARTITION result.fetchDuration = time.Since(startTime) return result } } else { result.errorCode = 3 // UNKNOWN_TOPIC_OR_PARTITION result.fetchDuration = time.Since(startTime) return result } } // Normalize special fetch offsets effectiveFetchOffset := partition.FetchOffset if effectiveFetchOffset < 0 { if effectiveFetchOffset == -2 { effectiveFetchOffset = 0 } else if effectiveFetchOffset == -1 { effectiveFetchOffset = highWaterMark } } // Fetch records if available var recordBatch []byte if highWaterMark > effectiveFetchOffset { // Use multi-batch fetcher (pass context to respect timeout) multiFetcher := NewMultiBatchFetcher(h) fetchResult, err := multiFetcher.FetchMultipleBatches( ctx, topicName, partition.PartitionID, effectiveFetchOffset, highWaterMark, partition.MaxBytes, ) if err == nil && fetchResult.TotalSize > 0 { recordBatch = fetchResult.RecordBatches } else { // Fallback to single batch (pass context to respect timeout) smqRecords, err := h.seaweedMQHandler.GetStoredRecords(ctx, topicName, partition.PartitionID, effectiveFetchOffset, 10) if err == nil && len(smqRecords) > 0 { recordBatch = 
h.constructRecordBatchFromSMQ(topicName, effectiveFetchOffset, smqRecords) } else { recordBatch = []byte{} } } } else { recordBatch = []byte{} } // Try schematized records if needed and recordBatch is empty if isSchematizedTopic && len(recordBatch) == 0 { schematizedRecords, err := h.fetchSchematizedRecords(topicName, partition.PartitionID, effectiveFetchOffset, partition.MaxBytes) if err == nil && len(schematizedRecords) > 0 { schematizedBatch := h.createSchematizedRecordBatch(schematizedRecords, effectiveFetchOffset) if len(schematizedBatch) > 0 { recordBatch = schematizedBatch } } } result.recordBatch = recordBatch result.fetchDuration = time.Since(startTime) return result }
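
// Illustrative note (not used by any caller): encodeVarint above applies ZigZag encoding
// before emitting LEB128-style continuation bytes, which is the varint form the v2 record
// format expects for lengths and deltas. Expected outputs for that implementation:
//
//	encodeVarint(0)  -> [0x00]       // zigzag(0) = 0
//	encodeVarint(-1) -> [0x01]       // zigzag(-1) = 1; this single byte encodes a null key/value length
//	encodeVarint(1)  -> [0x02]       // zigzag(1) = 2
//	encodeVarint(64) -> [0x80, 0x01] // zigzag(64) = 128 needs a continuation byte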