package engine

import (
	"container/heap"
	"context"
	"encoding/binary"
	"encoding/json"
	"fmt"
	"io"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/parquet-go/parquet-go"
	"github.com/seaweedfs/seaweedfs/weed/filer"
	"github.com/seaweedfs/seaweedfs/weed/mq"
	"github.com/seaweedfs/seaweedfs/weed/mq/logstore"
	"github.com/seaweedfs/seaweedfs/weed/mq/schema"
	"github.com/seaweedfs/seaweedfs/weed/mq/topic"
	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/mq_pb"
	"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
	"github.com/seaweedfs/seaweedfs/weed/query/sqltypes"
	"github.com/seaweedfs/seaweedfs/weed/util"
	"github.com/seaweedfs/seaweedfs/weed/util/chunk_cache"
	"github.com/seaweedfs/seaweedfs/weed/util/log_buffer"
	"github.com/seaweedfs/seaweedfs/weed/wdclient"
	"google.golang.org/protobuf/proto"
)

// HybridMessageScanner scans from ALL data sources:
// Architecture:
// 1. Unflushed in-memory data from brokers (mq_pb.DataMessage format) - REAL-TIME
// 2. Recent/live messages in log files (filer_pb.LogEntry format) - FLUSHED
// 3. Older messages in Parquet files (schema_pb.RecordValue format) - ARCHIVED
// 4. Seamlessly merges data from all sources chronologically
// 5. Provides complete real-time view of all messages in a topic
type HybridMessageScanner struct {
	filerClient   filer_pb.FilerClient
	brokerClient  BrokerClientInterface // For querying unflushed data
	topic         topic.Topic
	recordSchema  *schema_pb.RecordType
	schemaFormat  string // Serialization format: "AVRO", "PROTOBUF", "JSON_SCHEMA", or empty for schemaless
	parquetLevels *schema.ParquetLevels
	engine        *SQLEngine // Reference for system column formatting
}

// NewHybridMessageScanner creates a scanner that reads from all data sources
// This provides complete real-time message coverage including unflushed data
func NewHybridMessageScanner(filerClient filer_pb.FilerClient, brokerClient BrokerClientInterface, namespace, topicName string, engine *SQLEngine) (*HybridMessageScanner, error) {
	// Check if filerClient is available
	if filerClient == nil {
		return nil, fmt.Errorf("filerClient is required but not available")
	}

	// Create topic reference
	t := topic.Topic{
		Namespace: namespace,
		Name:      topicName,
	}

	// Get flat schema from broker client
	recordType, _, schemaFormat, err := brokerClient.GetTopicSchema(context.Background(), namespace, topicName)
	if err != nil {
		return nil, fmt.Errorf("failed to get topic record type: %v", err)
	}

	if recordType == nil || len(recordType.Fields) == 0 {
		// For topics without schema, create a minimal schema with system fields and _value
		recordType = schema.RecordTypeBegin().
			WithField(SW_COLUMN_NAME_TIMESTAMP, schema.TypeInt64).
			WithField(SW_COLUMN_NAME_KEY, schema.TypeBytes).
			WithField(SW_COLUMN_NAME_VALUE, schema.TypeBytes). // Raw message value
			RecordTypeEnd()
	} else {
		// Create a copy of the recordType to avoid modifying the original
		recordTypeCopy := &schema_pb.RecordType{
			Fields: make([]*schema_pb.Field, len(recordType.Fields)),
		}
		copy(recordTypeCopy.Fields, recordType.Fields)

		// Add system columns that MQ adds to all records
		recordType = schema.NewRecordTypeBuilder(recordTypeCopy).
			WithField(SW_COLUMN_NAME_TIMESTAMP, schema.TypeInt64).
			WithField(SW_COLUMN_NAME_KEY, schema.TypeBytes).
			RecordTypeEnd()
	}

	// Convert to Parquet levels for efficient reading
	parquetLevels, err := schema.ToParquetLevels(recordType)
	if err != nil {
		return nil, fmt.Errorf("failed to create Parquet levels: %v", err)
	}

	return &HybridMessageScanner{
		filerClient:   filerClient,
		brokerClient:  brokerClient,
		topic:         t,
		recordSchema:  recordType,
		schemaFormat:  schemaFormat,
		parquetLevels: parquetLevels,
		engine:        engine,
	}, nil
}
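
// Illustrative usage sketch (not called from this file; variable names such as
// filerClient, brokerClient, and eng are hypothetical placeholders):
//
//	scanner, err := NewHybridMessageScanner(filerClient, brokerClient, "my_ns", "my_topic", eng)
//	if err != nil {
//		// handle error
//	}
//	results, err := scanner.Scan(ctx, HybridScanOptions{Limit: 100})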

// HybridScanOptions configure how the scanner reads from both live and archived data
type HybridScanOptions struct {
	// Time range filtering (Unix nanoseconds)
	StartTimeNs int64
	StopTimeNs  int64

	// Column projection - if empty, select all columns
	Columns []string

	// Row limit: -1 means no limit, 0 means LIMIT 0 (empty result), >0 caps the row count
	Limit int

	// Row offset - 0 means no offset
	Offset int

	// Predicate for WHERE clause filtering
	Predicate func(*schema_pb.RecordValue) bool
}
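
// Illustrative example (a sketch, not taken from a real call site): options for a
// query shaped like "SELECT col1 FROM t WHERE _ts_ns >= startNs LIMIT 10 OFFSET 20"
// would look roughly like this, where startNs and "col1" are hypothetical:
//
//	opts := HybridScanOptions{
//		StartTimeNs: startNs,
//		Columns:     []string{"col1"},
//		Limit:       10,
//		Offset:      20,
//	}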

// HybridScanResult represents a message from either live logs or Parquet files
type HybridScanResult struct {
	Values    map[string]*schema_pb.Value // Column name -> value
	Timestamp int64                       // Message timestamp (_ts_ns)
	Key       []byte                      // Message key (_key)
	Source    string                      // "live_log" or "parquet_archive" or "in_memory_broker"
}

// HybridScanStats contains statistics about data sources scanned
type HybridScanStats struct {
	BrokerBufferQueried  bool
	BrokerBufferMessages int
	BufferStartIndex     int64
	PartitionsScanned    int
	LiveLogFilesScanned  int // Number of live log files processed
}

// ParquetColumnStats holds statistics for a single column from parquet metadata
type ParquetColumnStats struct {
	ColumnName string
	MinValue   *schema_pb.Value
	MaxValue   *schema_pb.Value
	NullCount  int64
	RowCount   int64
}

// ParquetFileStats holds aggregated statistics for a parquet file
type ParquetFileStats struct {
	FileName    string
	RowCount    int64
	ColumnStats map[string]*ParquetColumnStats
	// Optional file-level timestamp range from filer extended attributes
	MinTimestampNs int64
	MaxTimestampNs int64
}

// getTimestampRangeFromStats returns (minTsNs, maxTsNs, ok) by inspecting common timestamp columns
func (h *HybridMessageScanner) getTimestampRangeFromStats(fileStats *ParquetFileStats) (int64, int64, bool) {
	if fileStats == nil {
		return 0, 0, false
	}
	// Prefer column stats for _ts_ns if present
	if len(fileStats.ColumnStats) > 0 {
		if s, ok := fileStats.ColumnStats[logstore.SW_COLUMN_NAME_TS]; ok && s != nil && s.MinValue != nil && s.MaxValue != nil {
			if minNs, okMin := h.schemaValueToNs(s.MinValue); okMin {
				if maxNs, okMax := h.schemaValueToNs(s.MaxValue); okMax {
					return minNs, maxNs, true
				}
			}
		}
	}
	// Fallback to file-level range if present in filer extended metadata
	if fileStats.MinTimestampNs != 0 || fileStats.MaxTimestampNs != 0 {
		return fileStats.MinTimestampNs, fileStats.MaxTimestampNs, true
	}
	return 0, 0, false
}

// schemaValueToNs converts a schema_pb.Value that represents a timestamp to ns
func (h *HybridMessageScanner) schemaValueToNs(v *schema_pb.Value) (int64, bool) {
	if v == nil {
		return 0, false
	}
	switch k := v.Kind.(type) {
	case *schema_pb.Value_Int64Value:
		return k.Int64Value, true
	case *schema_pb.Value_Int32Value:
		return int64(k.Int32Value), true
	default:
		return 0, false
	}
}

// StreamingDataSource provides a streaming interface for reading scan results
type StreamingDataSource interface {
	Next() (*HybridScanResult, error) // Returns next result or nil when done
	HasMore() bool                    // Returns true if more data available
	Close() error                     // Clean up resources
}

// StreamingMergeItem represents an item in the priority queue for streaming merge
type StreamingMergeItem struct {
	Result     *HybridScanResult
	SourceID   int
	DataSource StreamingDataSource
}

// StreamingMergeHeap implements heap.Interface for merging sorted streams by timestamp
type StreamingMergeHeap []*StreamingMergeItem

func (h StreamingMergeHeap) Len() int { return len(h) }

func (h StreamingMergeHeap) Less(i, j int) bool {
	// Sort by timestamp (ascending order)
	return h[i].Result.Timestamp < h[j].Result.Timestamp
}

func (h StreamingMergeHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }

func (h *StreamingMergeHeap) Push(x interface{}) {
	*h = append(*h, x.(*StreamingMergeItem))
}

func (h *StreamingMergeHeap) Pop() interface{} {
	old := *h
	n := len(old)
	item := old[n-1]
	*h = old[0 : n-1]
	return item
}
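
// The heap above is meant to be driven through container/heap in the usual
// k-way merge pattern. A minimal sketch (assuming each source already yields
// results in ascending timestamp order; "sources" is a hypothetical
// []StreamingDataSource):
//
//	h := &StreamingMergeHeap{}
//	for i, src := range sources {
//		if src.HasMore() {
//			if r, err := src.Next(); err == nil && r != nil {
//				heap.Push(h, &StreamingMergeItem{Result: r, SourceID: i, DataSource: src})
//			}
//		}
//	}
//	for h.Len() > 0 {
//		item := heap.Pop(h).(*StreamingMergeItem)
//		// emit item.Result, then refill the heap from item.DataSource
//	}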

// Scan reads messages from both live logs and archived Parquet files
// Uses SeaweedFS MQ's GenMergedReadFunc for seamless integration
// Assumptions:
// 1. Chronologically merges live and archived data
// 2. Applies filtering at the lowest level for efficiency
// 3. Handles schema evolution transparently
func (hms *HybridMessageScanner) Scan(ctx context.Context, options HybridScanOptions) ([]HybridScanResult, error) {
	results, _, err := hms.ScanWithStats(ctx, options)
	return results, err
}

// ScanWithStats reads messages and returns scan statistics for execution plans
func (hms *HybridMessageScanner) ScanWithStats(ctx context.Context, options HybridScanOptions) ([]HybridScanResult, *HybridScanStats, error) {
	var results []HybridScanResult
	stats := &HybridScanStats{}

	// Get all partitions for this topic via MQ broker discovery
	partitions, err := hms.discoverTopicPartitions(ctx)
	if err != nil {
		return nil, stats, fmt.Errorf("failed to discover partitions for topic %s: %v", hms.topic.String(), err)
	}

	stats.PartitionsScanned = len(partitions)

	for _, partition := range partitions {
		partitionResults, partitionStats, err := hms.scanPartitionHybridWithStats(ctx, partition, options)
		if err != nil {
			return nil, stats, fmt.Errorf("failed to scan partition %v: %v", partition, err)
		}

		results = append(results, partitionResults...)

		// Aggregate broker buffer stats
		if partitionStats != nil {
			if partitionStats.BrokerBufferQueried {
				stats.BrokerBufferQueried = true
			}
			stats.BrokerBufferMessages += partitionStats.BrokerBufferMessages
			if partitionStats.BufferStartIndex > 0 && (stats.BufferStartIndex == 0 || partitionStats.BufferStartIndex < stats.BufferStartIndex) {
				stats.BufferStartIndex = partitionStats.BufferStartIndex
			}
		}

		// Apply global limit (without offset) across all partitions
		// When OFFSET is used, collect more data to ensure we have enough after skipping
		// Note: OFFSET will be applied at the end to avoid double-application
		if options.Limit > 0 {
			// Collect exact amount needed: LIMIT + OFFSET (no excessive doubling)
			minRequired := options.Limit + options.Offset
			// Small buffer only when needed to handle edge cases in distributed scanning
			if options.Offset > 0 && minRequired < 10 {
				minRequired = minRequired + 1 // Add 1 extra row buffer, not doubling
			}
			if len(results) >= minRequired {
				break
			}
		}
	}

	// Apply final OFFSET and LIMIT processing (done once at the end)
	// Limit semantics: -1 = no limit, 0 = LIMIT 0 (empty), >0 = limit to N rows
	if options.Offset > 0 || options.Limit >= 0 {
		// Handle LIMIT 0 special case first
		if options.Limit == 0 {
			return []HybridScanResult{}, stats, nil
		}

		// Apply OFFSET first
		if options.Offset > 0 {
			if options.Offset >= len(results) {
				results = []HybridScanResult{}
			} else {
				results = results[options.Offset:]
			}
		}

		// Apply LIMIT after OFFSET (only if limit > 0)
		if options.Limit > 0 && len(results) > options.Limit {
			results = results[:options.Limit]
		}
	}

	return results, stats, nil
}
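
// Worked example of the collection arithmetic above (illustrative): for
// LIMIT 10 OFFSET 20, the partition loop keeps scanning until at least
// 30 rows (Limit + Offset) are buffered, the final pass drops the first
// 20 rows, and the remainder is truncated to 10.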

// scanUnflushedData queries brokers for unflushed in-memory data using buffer_start deduplication
func (hms *HybridMessageScanner) scanUnflushedData(ctx context.Context, partition topic.Partition, options HybridScanOptions) ([]HybridScanResult, error) {
	results, _, err := hms.scanUnflushedDataWithStats(ctx, partition, options)
	return results, err
}

// scanUnflushedDataWithStats queries brokers for unflushed data and returns statistics
func (hms *HybridMessageScanner) scanUnflushedDataWithStats(ctx context.Context, partition topic.Partition, options HybridScanOptions) ([]HybridScanResult, *HybridScanStats, error) {
	var results []HybridScanResult
	stats := &HybridScanStats{}

	// Skip if no broker client available
	if hms.brokerClient == nil {
		return results, stats, nil
	}

	// Mark that we attempted to query broker buffer
	stats.BrokerBufferQueried = true

	// Step 1: Get unflushed data from broker using buffer_start-based method
	// This method uses buffer_start metadata to avoid double-counting with exact precision
	unflushedEntries, err := hms.brokerClient.GetUnflushedMessages(ctx, hms.topic.Namespace, hms.topic.Name, partition, options.StartTimeNs)
	if err != nil {
		// Log error but don't fail the query - continue with disk data only
		// Reset queried flag on error
		stats.BrokerBufferQueried = false
		return results, stats, nil
	}

	// Capture stats for EXPLAIN
	stats.BrokerBufferMessages = len(unflushedEntries)

	// Step 2: Process unflushed entries (already deduplicated by broker)
	for _, logEntry := range unflushedEntries {
		// Pre-decode DataMessage for reuse in both control check and conversion
		var dataMessage *mq_pb.DataMessage
		if len(logEntry.Data) > 0 {
			dataMessage = &mq_pb.DataMessage{}
			if err := proto.Unmarshal(logEntry.Data, dataMessage); err != nil {
				dataMessage = nil // Failed to decode, treat as raw data
			}
		}

		// Skip control entries without actual data
		if hms.isControlEntryWithDecoded(logEntry, dataMessage) {
			continue // Skip this entry
		}

		// Skip messages outside time range
		if options.StartTimeNs > 0 && logEntry.TsNs < options.StartTimeNs {
			continue
		}
		if options.StopTimeNs > 0 && logEntry.TsNs > options.StopTimeNs {
			continue
		}

		// Convert LogEntry to RecordValue format (same as disk data)
		recordValue, _, err := hms.convertLogEntryToRecordValueWithDecoded(logEntry, dataMessage)
		if err != nil {
			continue // Skip malformed messages
		}

		// Apply predicate filter if provided
		if options.Predicate != nil && !options.Predicate(recordValue) {
			continue
		}

		// Extract system columns for result
		timestamp := recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP].GetInt64Value()
		key := recordValue.Fields[SW_COLUMN_NAME_KEY].GetBytesValue()

		// Apply column projection
		values := make(map[string]*schema_pb.Value)
		if len(options.Columns) == 0 {
			// Select all columns (excluding system columns from user view)
			for name, value := range recordValue.Fields {
				if name != SW_COLUMN_NAME_TIMESTAMP && name != SW_COLUMN_NAME_KEY {
					values[name] = value
				}
			}
		} else {
			// Select specified columns only
			for _, columnName := range options.Columns {
				if value, exists := recordValue.Fields[columnName]; exists {
					values[columnName] = value
				}
			}
		}

		// Create result with proper source tagging
		result := HybridScanResult{
			Values:    values,
			Timestamp: timestamp,
			Key:       key,
			Source:    "live_log", // Data from broker's unflushed messages
		}

		results = append(results, result)

		// Apply limit (accounting for offset) - collect exact amount needed
		if options.Limit > 0 {
			// Collect exact amount needed: LIMIT + OFFSET (no excessive doubling)
			minRequired := options.Limit + options.Offset
			// Small buffer only when needed to handle edge cases in message streaming
			if options.Offset > 0 && minRequired < 10 {
				minRequired = minRequired + 1 // Add 1 extra row buffer, not doubling
			}
			if len(results) >= minRequired {
				break
			}
		}
	}

	return results, stats, nil
}

// convertDataMessageToRecord converts mq_pb.DataMessage to schema_pb.RecordValue
func (hms *HybridMessageScanner) convertDataMessageToRecord(msg *mq_pb.DataMessage) (*schema_pb.RecordValue, string, error) {
	// Parse the message data as RecordValue
	recordValue := &schema_pb.RecordValue{}
	if err := proto.Unmarshal(msg.Value, recordValue); err != nil {
		return nil, "", fmt.Errorf("failed to unmarshal message data: %v", err)
	}

	// Add system columns
	if recordValue.Fields == nil {
		recordValue.Fields = make(map[string]*schema_pb.Value)
	}

	// Add timestamp
	recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP] = &schema_pb.Value{
		Kind: &schema_pb.Value_Int64Value{Int64Value: msg.TsNs},
	}

	return recordValue, string(msg.Key), nil
}

// discoverTopicPartitions discovers the actual partitions for this topic by scanning the filesystem
// This finds real partition directories like v2025-09-01-07-16-34/0000-0630/
func (hms *HybridMessageScanner) discoverTopicPartitions(ctx context.Context) ([]topic.Partition, error) {
	if hms.filerClient == nil {
		return nil, fmt.Errorf("filerClient not available for partition discovery")
	}

	var allPartitions []topic.Partition
	var err error

	// Scan the topic directory for actual partition versions (timestamped directories)
	// List all version directories in the topic directory
	err = filer_pb.ReadDirAllEntries(ctx, hms.filerClient, util.FullPath(hms.topic.Dir()), "", func(versionEntry *filer_pb.Entry, isLast bool) error {
		if !versionEntry.IsDirectory {
			return nil // Skip non-directories
		}

		// Parse version timestamp from directory name (e.g., "v2025-09-01-07-16-34")
		versionTime, parseErr := topic.ParseTopicVersion(versionEntry.Name)
		if parseErr != nil {
			// Skip directories that don't match the version format
			return nil
		}

		// Scan partition directories within this version
		versionDir := fmt.Sprintf("%s/%s", hms.topic.Dir(), versionEntry.Name)
		return filer_pb.ReadDirAllEntries(ctx, hms.filerClient, util.FullPath(versionDir), "", func(partitionEntry *filer_pb.Entry, isLast bool) error {
			if !partitionEntry.IsDirectory {
				return nil // Skip non-directories
			}

			// Parse partition boundary from directory name (e.g., "0000-0630")
			rangeStart, rangeStop := topic.ParsePartitionBoundary(partitionEntry.Name)
			if rangeStart == rangeStop {
				return nil // Skip invalid partition names
			}

			// Create partition object
			partition := topic.Partition{
				RangeStart: rangeStart,
				RangeStop:  rangeStop,
				RingSize:   topic.PartitionCount,
				UnixTimeNs: versionTime.UnixNano(),
			}

			allPartitions = append(allPartitions, partition)
			return nil
		})
	})

	if err != nil {
		return nil, fmt.Errorf("failed to scan topic directory for partitions: %v", err)
	}

	// If no partitions found, return empty slice (valid for newly created or empty topics)
	if len(allPartitions) == 0 {
		fmt.Printf("No partitions found for topic %s - returning empty result set\n", hms.topic.String())
		return []topic.Partition{}, nil
	}

	fmt.Printf("Discovered %d partitions for topic %s\n", len(allPartitions), hms.topic.String())
	return allPartitions, nil
}
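
// Illustrative layout walked by discoverTopicPartitions above (the paths are
// hypothetical examples; only the version and boundary directory formats are
// taken from the parsing code):
//
//	<topic dir>/v2025-09-01-07-16-34/0000-0630/
//	<topic dir>/v2025-09-01-07-16-34/0630-1260/
//
// Each boundary directory under a parseable version directory becomes one
// topic.Partition, with UnixTimeNs taken from the version timestamp.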

// scanPartitionHybrid scans a specific partition using the hybrid approach
// This is where the magic happens - seamlessly reading ALL data sources:
// 1. Unflushed in-memory data from brokers (REAL-TIME)
// 2. Live logs + Parquet files from disk (FLUSHED/ARCHIVED)
func (hms *HybridMessageScanner) scanPartitionHybrid(ctx context.Context, partition topic.Partition, options HybridScanOptions) ([]HybridScanResult, error) {
	results, _, err := hms.scanPartitionHybridWithStats(ctx, partition, options)
	return results, err
}

// scanPartitionHybridWithStats scans a specific partition using streaming merge for memory efficiency
// PERFORMANCE IMPROVEMENT: Uses heap-based streaming merge instead of collecting all data and sorting
// - Memory usage: O(k) where k = number of data sources, instead of O(n) where n = total records
// - Scalable: Can handle large topics without LIMIT clauses efficiently
// - Streaming: Processes data as it arrives rather than buffering everything
func (hms *HybridMessageScanner) scanPartitionHybridWithStats(ctx context.Context, partition topic.Partition, options HybridScanOptions) ([]HybridScanResult, *HybridScanStats, error) {
	stats := &HybridScanStats{}

	// STEP 1: Scan unflushed in-memory data from brokers (REAL-TIME)
	unflushedResults, unflushedStats, err := hms.scanUnflushedDataWithStats(ctx, partition, options)
	if err != nil {
		// Don't fail the query if broker scanning fails, but provide clear warning to user
		// This ensures users are aware that results may not include the most recent data
		fmt.Printf("Warning: Unable to access real-time data from message broker: %v\n", err)
		fmt.Printf("Note: Query results may not include the most recent unflushed messages\n")
	} else if unflushedStats != nil {
		stats.BrokerBufferQueried = unflushedStats.BrokerBufferQueried
		stats.BrokerBufferMessages = unflushedStats.BrokerBufferMessages
		stats.BufferStartIndex = unflushedStats.BufferStartIndex
	}

	// Count live log files for statistics
	liveLogCount, err := hms.countLiveLogFiles(partition)
	if err != nil {
		// Don't fail the query, just log warning
		fmt.Printf("Warning: Failed to count live log files: %v\n", err)
		liveLogCount = 0
	}
	stats.LiveLogFilesScanned = liveLogCount

	// STEP 2: Create streaming data sources for memory-efficient merge
	var dataSources []StreamingDataSource

	// Add unflushed data source (if we have unflushed results)
	if len(unflushedResults) > 0 {
		// Sort unflushed results by timestamp before creating stream
		if len(unflushedResults) > 1 {
			hms.mergeSort(unflushedResults, 0, len(unflushedResults)-1)
		}
		dataSources = append(dataSources, NewSliceDataSource(unflushedResults))
	}

	// Add streaming flushed data source (live logs + Parquet files)
	flushedDataSource := NewStreamingFlushedDataSource(hms, partition, options)
	dataSources = append(dataSources, flushedDataSource)

	// STEP 3: Use streaming merge for memory-efficient chronological ordering
	var results []HybridScanResult
	if len(dataSources) > 0 {
		// Calculate how many rows we need to collect during scanning (before OFFSET/LIMIT)
		// For LIMIT N OFFSET M, we need to collect at least N+M rows
		scanLimit := options.Limit
		if options.Limit > 0 && options.Offset > 0 {
			scanLimit = options.Limit + options.Offset
		}

		mergedResults, err := hms.streamingMerge(dataSources, scanLimit)
		if err != nil {
			return nil, stats, fmt.Errorf("streaming merge failed: %v", err)
		}
		results = mergedResults
	}

	return results, stats, nil
}

// countLiveLogFiles counts the number of live log files in a partition for statistics
func (hms *HybridMessageScanner) countLiveLogFiles(partition topic.Partition) (int, error) {
	partitionDir := topic.PartitionDir(hms.topic, partition)

	var fileCount int
	err := hms.filerClient.WithFilerClient(false, func(client filer_pb.SeaweedFilerClient) error {
		// List all files in partition directory
		request := &filer_pb.ListEntriesRequest{
			Directory:          partitionDir,
			Prefix:             "",
			StartFromFileName:  "",
			InclusiveStartFrom: true,
			Limit:              10000, // reasonable limit for counting
		}

		stream, err := client.ListEntries(context.Background(), request)
		if err != nil {
			return err
		}

		for {
			resp, err := stream.Recv()
			if err == io.EOF {
				break
			}
			if err != nil {
				return err
			}

			// Count files that are not .parquet files (live log files)
			// Live log files typically have timestamps or are named like log files
			fileName := resp.Entry.Name
			if !strings.HasSuffix(fileName, ".parquet") &&
				!strings.HasSuffix(fileName, ".offset") &&
				len(resp.Entry.Chunks) > 0 { // Has actual content
				fileCount++
			}
		}

		return nil
	})

	if err != nil {
		return 0, err
	}
	return fileCount, nil
}

// isControlEntry checks if a log entry is a control entry without actual data
// Based on MQ system analysis, control entries are:
// 1. DataMessages with populated Ctrl field (publisher close signals)
// 2. Entries with empty keys (as filtered by subscriber)
// NOTE: Messages with empty data but valid keys (like NOOP messages) are NOT control entries
func (hms *HybridMessageScanner) isControlEntry(logEntry *filer_pb.LogEntry) bool {
	// Pre-decode DataMessage if needed
	var dataMessage *mq_pb.DataMessage
	if len(logEntry.Data) > 0 {
		dataMessage = &mq_pb.DataMessage{}
		if err := proto.Unmarshal(logEntry.Data, dataMessage); err != nil {
			dataMessage = nil // Failed to decode, treat as raw data
		}
	}
	return hms.isControlEntryWithDecoded(logEntry, dataMessage)
}

// isControlEntryWithDecoded checks if a log entry is a control entry using pre-decoded DataMessage
// This avoids duplicate protobuf unmarshaling when the DataMessage is already decoded
func (hms *HybridMessageScanner) isControlEntryWithDecoded(logEntry *filer_pb.LogEntry, dataMessage *mq_pb.DataMessage) bool {
	// Skip entries with empty keys (same logic as subscriber)
	if len(logEntry.Key) == 0 {
		return true
	}

	// Check if this is a DataMessage with control field populated
	if dataMessage != nil && dataMessage.Ctrl != nil {
		return true
	}

	// Messages with valid keys (even if data is empty) are legitimate messages
	// Examples: NOOP messages from Schema Registry
	return false
}

// isNullOrEmpty checks if a schema_pb.Value is null or empty
func isNullOrEmpty(value *schema_pb.Value) bool {
	if value == nil {
		return true
	}

	switch v := value.Kind.(type) {
	case *schema_pb.Value_StringValue:
		return v.StringValue == ""
	case *schema_pb.Value_BytesValue:
		return len(v.BytesValue) == 0
	case *schema_pb.Value_ListValue:
		return v.ListValue == nil || len(v.ListValue.Values) == 0
	case nil:
		return true // No kind set means null
	default:
		return false
	}
}

// isSchemaless checks if the scanner is configured for a schema-less topic
// Schema-less topics only have system fields: _ts_ns, _key, and _value
func (hms *HybridMessageScanner) isSchemaless() bool {
	// Schema-less topics only have system fields: _ts_ns, _key, and _value
	// System topics like _schemas are NOT schema-less - they have structured data
	// We just need to map their fields during read

	if hms.recordSchema == nil {
		return false
	}

	// Count only non-system data fields (exclude _ts_ns and _key which are always present)
	// Schema-less topics should only have _value as the data field
	hasValue := false
	dataFieldCount := 0

	for _, field := range hms.recordSchema.Fields {
		switch field.Name {
		case SW_COLUMN_NAME_TIMESTAMP, SW_COLUMN_NAME_KEY:
			// System fields - ignore
			continue
		case SW_COLUMN_NAME_VALUE:
			hasValue = true
			dataFieldCount++
		default:
			// Any other field means it's not schema-less
			dataFieldCount++
		}
	}

	// Schema-less = only has _value field as the data field (plus system fields)
	return hasValue && dataFieldCount == 1
}

// convertLogEntryToRecordValue converts a filer_pb.LogEntry to schema_pb.RecordValue
// This handles both:
// 1. Live log entries (raw message format)
// 2. Parquet entries (already in schema_pb.RecordValue format)
// 3. Schema-less topics (raw bytes in _value field)
func (hms *HybridMessageScanner) convertLogEntryToRecordValue(logEntry *filer_pb.LogEntry) (*schema_pb.RecordValue, string, error) {
	// For schema-less topics, put raw data directly into _value field
	if hms.isSchemaless() {
		recordValue := &schema_pb.RecordValue{
			Fields: make(map[string]*schema_pb.Value),
		}
		recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP] = &schema_pb.Value{
			Kind: &schema_pb.Value_Int64Value{Int64Value: logEntry.TsNs},
		}
		recordValue.Fields[SW_COLUMN_NAME_KEY] = &schema_pb.Value{
			Kind: &schema_pb.Value_BytesValue{BytesValue: logEntry.Key},
		}
		recordValue.Fields[SW_COLUMN_NAME_VALUE] = &schema_pb.Value{
			Kind: &schema_pb.Value_BytesValue{BytesValue: logEntry.Data},
		}
		return recordValue, "live_log", nil
	}

	// Try to unmarshal as RecordValue first (Parquet format)
	recordValue := &schema_pb.RecordValue{}
	if err := proto.Unmarshal(logEntry.Data, recordValue); err == nil {
		// This is an archived message from Parquet files
		// FIX: Add system columns from LogEntry to RecordValue
		if recordValue.Fields == nil {
			recordValue.Fields = make(map[string]*schema_pb.Value)
		}

		// Add system columns from LogEntry
		recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP] = &schema_pb.Value{
			Kind: &schema_pb.Value_Int64Value{Int64Value: logEntry.TsNs},
		}
		recordValue.Fields[SW_COLUMN_NAME_KEY] = &schema_pb.Value{
			Kind: &schema_pb.Value_BytesValue{BytesValue: logEntry.Key},
		}

		return recordValue, "parquet_archive", nil
	}

	// If not a RecordValue, this is raw live message data - parse with schema
	return hms.parseRawMessageWithSchema(logEntry)
}

// min returns the minimum of two integers
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// parseRawMessageWithSchema parses raw live message data using the topic's schema
// This provides proper type conversion and field mapping instead of treating everything as strings
func (hms *HybridMessageScanner) parseRawMessageWithSchema(logEntry *filer_pb.LogEntry) (*schema_pb.RecordValue, string, error) {
	recordValue := &schema_pb.RecordValue{
		Fields: make(map[string]*schema_pb.Value),
	}

	// Add system columns (always present)
	recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP] = &schema_pb.Value{
		Kind: &schema_pb.Value_Int64Value{Int64Value: logEntry.TsNs},
	}
	recordValue.Fields[SW_COLUMN_NAME_KEY] = &schema_pb.Value{
		Kind: &schema_pb.Value_BytesValue{BytesValue: logEntry.Key},
	}

	// Parse message data based on schema
	if hms.recordSchema == nil || len(hms.recordSchema.Fields) == 0 {
		// Fallback: No schema available, use "_value" for schema-less topics only
		if hms.isSchemaless() {
			recordValue.Fields[SW_COLUMN_NAME_VALUE] = &schema_pb.Value{
				Kind: &schema_pb.Value_BytesValue{BytesValue: logEntry.Data},
			}
		}
		return recordValue, "live_log", nil
	}

	// Use schema format to directly choose the right decoder
	// This avoids trying multiple decoders and improves performance
	var parsedRecord *schema_pb.RecordValue
	var err error

	switch hms.schemaFormat {
	case "AVRO":
		// AVRO format - use Avro decoder
		// Note: Avro decoding requires schema registry integration
		// For now, fall through to JSON as many Avro messages are also valid JSON
		parsedRecord, err = hms.parseJSONMessage(logEntry.Data)
	case "PROTOBUF":
		// PROTOBUF format - use protobuf decoder
		parsedRecord, err = hms.parseProtobufMessage(logEntry.Data)
	case "JSON_SCHEMA", "":
		// JSON_SCHEMA format or empty (default to JSON)
		// JSON is the most common format for schema registry
		parsedRecord, err = hms.parseJSONMessage(logEntry.Data)
		if err != nil {
			// Try protobuf as fallback
			parsedRecord, err = hms.parseProtobufMessage(logEntry.Data)
		}
	default:
		// Unknown format - try JSON first, then protobuf as fallback
		parsedRecord, err = hms.parseJSONMessage(logEntry.Data)
		if err != nil {
			parsedRecord, err = hms.parseProtobufMessage(logEntry.Data)
		}
	}

	if err == nil && parsedRecord != nil {
		// Successfully parsed, merge with system columns
		for fieldName, fieldValue := range parsedRecord.Fields {
			recordValue.Fields[fieldName] = fieldValue
		}
		return recordValue, "live_log", nil
	}

	// Fallback: If schema has a single field, map the raw data to it with type conversion
	if len(hms.recordSchema.Fields) == 1 {
		field := hms.recordSchema.Fields[0]
		convertedValue, convErr := hms.convertRawDataToSchemaValue(logEntry.Data, field.Type)
		if convErr == nil {
			recordValue.Fields[field.Name] = convertedValue
			return recordValue, "live_log", nil
		}
	}

	// Final fallback: treat as bytes field for schema-less topics only
	if hms.isSchemaless() {
		recordValue.Fields[SW_COLUMN_NAME_VALUE] = &schema_pb.Value{
			Kind: &schema_pb.Value_BytesValue{BytesValue: logEntry.Data},
		}
	}

	return recordValue, "live_log", nil
}

// convertLogEntryToRecordValueWithDecoded converts a filer_pb.LogEntry to schema_pb.RecordValue
// using a pre-decoded DataMessage to avoid duplicate protobuf unmarshaling
func (hms *HybridMessageScanner) convertLogEntryToRecordValueWithDecoded(logEntry *filer_pb.LogEntry, dataMessage *mq_pb.DataMessage) (*schema_pb.RecordValue, string, error) {
	// IMPORTANT: Check for schema-less topics FIRST
	// Schema-less topics (like _schemas) should store raw data directly in _value field
	if hms.isSchemaless() {
		recordValue := &schema_pb.RecordValue{
			Fields: make(map[string]*schema_pb.Value),
		}
		recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP] = &schema_pb.Value{
			Kind: &schema_pb.Value_Int64Value{Int64Value: logEntry.TsNs},
		}
		recordValue.Fields[SW_COLUMN_NAME_KEY] = &schema_pb.Value{
			Kind: &schema_pb.Value_BytesValue{BytesValue: logEntry.Key},
		}
		recordValue.Fields[SW_COLUMN_NAME_VALUE] = &schema_pb.Value{
			Kind: &schema_pb.Value_BytesValue{BytesValue: logEntry.Data},
		}
		return recordValue, "live_log", nil
	}

	// CRITICAL: The broker stores DataMessage.Value directly in LogEntry.Data
	// So we need to try unmarshaling LogEntry.Data as RecordValue first
	var recordValueBytes []byte

	if dataMessage != nil && len(dataMessage.Value) > 0 {
		// DataMessage has a Value field - use it
		recordValueBytes = dataMessage.Value
	} else {
		// DataMessage doesn't have Value, use LogEntry.Data directly
		// This is the normal case when broker stores messages
		recordValueBytes = logEntry.Data
	}

	// Try to unmarshal as RecordValue
	if len(recordValueBytes) > 0 {
		recordValue := &schema_pb.RecordValue{}
		if err := proto.Unmarshal(recordValueBytes, recordValue); err == nil {
			// Successfully unmarshaled as RecordValue

			// Ensure Fields map exists
			if recordValue.Fields == nil {
				recordValue.Fields = make(map[string]*schema_pb.Value)
			}

			// Add system columns from LogEntry
			recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP] = &schema_pb.Value{
				Kind: &schema_pb.Value_Int64Value{Int64Value: logEntry.TsNs},
			}
			recordValue.Fields[SW_COLUMN_NAME_KEY] = &schema_pb.Value{
				Kind: &schema_pb.Value_BytesValue{BytesValue: logEntry.Key},
			}

			return recordValue, "live_log", nil
		}
		// If unmarshaling as RecordValue fails, fall back to schema-aware parsing
	}

	// For cases where protobuf unmarshaling failed or data is empty,
	// attempt schema-aware parsing to try JSON, protobuf, and other formats
	return hms.parseRawMessageWithSchema(logEntry)
}

// parseJSONMessage attempts to parse raw data as JSON and map to schema fields
func (hms *HybridMessageScanner) parseJSONMessage(data []byte) (*schema_pb.RecordValue, error) {
	// Try to parse as JSON
	var jsonData map[string]interface{}
	if err := json.Unmarshal(data, &jsonData); err != nil {
		return nil, fmt.Errorf("not valid JSON: %v", err)
	}

	recordValue := &schema_pb.RecordValue{
		Fields: make(map[string]*schema_pb.Value),
	}

	// Map JSON fields to schema fields
	for _, schemaField := range hms.recordSchema.Fields {
		fieldName := schemaField.Name
		if jsonValue, exists := jsonData[fieldName]; exists {
			schemaValue, err := hms.convertJSONValueToSchemaValue(jsonValue, schemaField.Type)
			if err != nil {
				// Log conversion error but continue with other fields
				continue
			}
			recordValue.Fields[fieldName] = schemaValue
		}
	}

	return recordValue, nil
}
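
// Illustrative mapping performed by parseJSONMessage above (hypothetical schema
// and payload): with schema fields "id" (INT64) and "name" (STRING), the payload
// {"id": 42, "name": "alice"} becomes a RecordValue holding Int64Value(42) and
// StringValue("alice"); JSON keys that are not in the schema are ignored, and
// schema fields missing from the payload are simply left unset.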
| 
 | |
| // parseProtobufMessage attempts to parse raw data as protobuf RecordValue | |
| func (hms *HybridMessageScanner) parseProtobufMessage(data []byte) (*schema_pb.RecordValue, error) { | |
| 	// This might be a raw protobuf message that didn't parse correctly the first time | |
| 	// Try alternative protobuf unmarshaling approaches | |
| 	recordValue := &schema_pb.RecordValue{} | |
| 
 | |
| 	// Strategy 1: Direct unmarshaling (might work if it's actually a RecordValue) | |
| 	if err := proto.Unmarshal(data, recordValue); err == nil { | |
| 		return recordValue, nil | |
| 	} | |
| 
 | |
| 	// Strategy 2: Check if it's a different protobuf message type | |
| 	// For now, return error as we need more specific knowledge of MQ message formats | |
| 	return nil, fmt.Errorf("could not parse as protobuf RecordValue") | |
| } | |
| 
 | |
| // convertRawDataToSchemaValue converts raw bytes to a specific schema type | |
| func (hms *HybridMessageScanner) convertRawDataToSchemaValue(data []byte, fieldType *schema_pb.Type) (*schema_pb.Value, error) { | |
| 	dataStr := string(data) | |
| 
 | |
| 	switch fieldType.Kind.(type) { | |
| 	case *schema_pb.Type_ScalarType: | |
| 		scalarType := fieldType.GetScalarType() | |
| 		switch scalarType { | |
| 		case schema_pb.ScalarType_STRING: | |
| 			return &schema_pb.Value{ | |
| 				Kind: &schema_pb.Value_StringValue{StringValue: dataStr}, | |
| 			}, nil | |
| 		case schema_pb.ScalarType_INT32: | |
| 			if val, err := strconv.ParseInt(strings.TrimSpace(dataStr), 10, 32); err == nil { | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_Int32Value{Int32Value: int32(val)}, | |
| 				}, nil | |
| 			} | |
| 		case schema_pb.ScalarType_INT64: | |
| 			if val, err := strconv.ParseInt(strings.TrimSpace(dataStr), 10, 64); err == nil { | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_Int64Value{Int64Value: val}, | |
| 				}, nil | |
| 			} | |
| 		case schema_pb.ScalarType_FLOAT: | |
| 			if val, err := strconv.ParseFloat(strings.TrimSpace(dataStr), 32); err == nil { | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_FloatValue{FloatValue: float32(val)}, | |
| 				}, nil | |
| 			} | |
| 		case schema_pb.ScalarType_DOUBLE: | |
| 			if val, err := strconv.ParseFloat(strings.TrimSpace(dataStr), 64); err == nil { | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_DoubleValue{DoubleValue: val}, | |
| 				}, nil | |
| 			} | |
| 		case schema_pb.ScalarType_BOOL: | |
| 			lowerStr := strings.ToLower(strings.TrimSpace(dataStr)) | |
| 			if lowerStr == "true" || lowerStr == "1" || lowerStr == "yes" { | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_BoolValue{BoolValue: true}, | |
| 				}, nil | |
| 			} else if lowerStr == "false" || lowerStr == "0" || lowerStr == "no" { | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_BoolValue{BoolValue: false}, | |
| 				}, nil | |
| 			} | |
| 		case schema_pb.ScalarType_BYTES: | |
| 			return &schema_pb.Value{ | |
| 				Kind: &schema_pb.Value_BytesValue{BytesValue: data}, | |
| 			}, nil | |
| 		} | |
| 	} | |
| 
 | |
| 	return nil, fmt.Errorf("unsupported type conversion for %v", fieldType) | |
| } | |
| 
 | |
| // convertJSONValueToSchemaValue converts a JSON value to schema_pb.Value based on schema type | |
| func (hms *HybridMessageScanner) convertJSONValueToSchemaValue(jsonValue interface{}, fieldType *schema_pb.Type) (*schema_pb.Value, error) { | |
| 	switch fieldType.Kind.(type) { | |
| 	case *schema_pb.Type_ScalarType: | |
| 		scalarType := fieldType.GetScalarType() | |
| 		switch scalarType { | |
| 		case schema_pb.ScalarType_STRING: | |
| 			if str, ok := jsonValue.(string); ok { | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_StringValue{StringValue: str}, | |
| 				}, nil | |
| 			} | |
| 			// Convert other types to string | |
| 			return &schema_pb.Value{ | |
| 				Kind: &schema_pb.Value_StringValue{StringValue: fmt.Sprintf("%v", jsonValue)}, | |
| 			}, nil | |
| 		case schema_pb.ScalarType_INT32: | |
| 			if num, ok := jsonValue.(float64); ok { // JSON numbers are float64 | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_Int32Value{Int32Value: int32(num)}, | |
| 				}, nil | |
| 			} | |
| 		case schema_pb.ScalarType_INT64: | |
| 			if num, ok := jsonValue.(float64); ok { | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_Int64Value{Int64Value: int64(num)}, | |
| 				}, nil | |
| 			} | |
| 		case schema_pb.ScalarType_FLOAT: | |
| 			if num, ok := jsonValue.(float64); ok { | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_FloatValue{FloatValue: float32(num)}, | |
| 				}, nil | |
| 			} | |
| 		case schema_pb.ScalarType_DOUBLE: | |
| 			if num, ok := jsonValue.(float64); ok { | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_DoubleValue{DoubleValue: num}, | |
| 				}, nil | |
| 			} | |
| 		case schema_pb.ScalarType_BOOL: | |
| 			if boolVal, ok := jsonValue.(bool); ok { | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_BoolValue{BoolValue: boolVal}, | |
| 				}, nil | |
| 			} | |
| 		case schema_pb.ScalarType_BYTES: | |
| 			if str, ok := jsonValue.(string); ok { | |
| 				return &schema_pb.Value{ | |
| 					Kind: &schema_pb.Value_BytesValue{BytesValue: []byte(str)}, | |
| 				}, nil | |
| 			} | |
| 		} | |
| 	} | |
| 
 | |
| 	return nil, fmt.Errorf("incompatible JSON value type %T for schema type %v", jsonValue, fieldType) | |
| } | |
| 
 | |
| // ConvertToSQLResult converts HybridScanResults to SQL query results | |
| func (hms *HybridMessageScanner) ConvertToSQLResult(results []HybridScanResult, columns []string) *QueryResult { | |
| 	if len(results) == 0 { | |
| 		return &QueryResult{ | |
| 			Columns:  columns, | |
| 			Rows:     [][]sqltypes.Value{}, | |
| 			Database: hms.topic.Namespace, | |
| 			Table:    hms.topic.Name, | |
| 		} | |
| 	} | |
| 
 | |
| 	// Determine columns if not specified | |
| 	if len(columns) == 0 { | |
| 		columnSet := make(map[string]bool) | |
| 		for _, result := range results { | |
| 			for columnName := range result.Values { | |
| 				columnSet[columnName] = true | |
| 			} | |
| 		} | |
| 
 | |
| 		columns = make([]string, 0, len(columnSet)) | |
| 		for columnName := range columnSet { | |
| 			columns = append(columns, columnName) | |
| 		} | |
| 
 | |
| 		// If no data columns were found, include system columns so we have something to display | |
| 		if len(columns) == 0 { | |
| 			columns = []string{SW_DISPLAY_NAME_TIMESTAMP, SW_COLUMN_NAME_KEY} | |
| 		} | |
| 	} | |
| 
 | |
| 	// Convert to SQL rows | |
| 	rows := make([][]sqltypes.Value, len(results)) | |
| 	for i, result := range results { | |
| 		row := make([]sqltypes.Value, len(columns)) | |
| 		for j, columnName := range columns { | |
| 			switch columnName { | |
| 			case SW_COLUMN_NAME_SOURCE: | |
| 				row[j] = sqltypes.NewVarChar(result.Source) | |
| 			case SW_COLUMN_NAME_TIMESTAMP, SW_DISPLAY_NAME_TIMESTAMP: | |
| 				// Format timestamp as proper timestamp type instead of raw nanoseconds | |
| 				row[j] = hms.engine.formatTimestampColumn(result.Timestamp) | |
| 			case SW_COLUMN_NAME_KEY: | |
| 				row[j] = sqltypes.NewVarBinary(string(result.Key)) | |
| 			default: | |
| 				if value, exists := result.Values[columnName]; exists { | |
| 					row[j] = convertSchemaValueToSQL(value) | |
| 				} else { | |
| 					row[j] = sqltypes.NULL | |
| 				} | |
| 			} | |
| 		} | |
| 		rows[i] = row | |
| 	} | |
| 
 | |
| 	return &QueryResult{ | |
| 		Columns:  columns, | |
| 		Rows:     rows, | |
| 		Database: hms.topic.Namespace, | |
| 		Table:    hms.topic.Name, | |
| 	} | |
| } | |
| 
 | |
| // ConvertToSQLResultWithMixedColumns handles SELECT *, specific_columns queries | |
| // Combines auto-discovered columns (from *) with explicitly requested columns | |
| func (hms *HybridMessageScanner) ConvertToSQLResultWithMixedColumns(results []HybridScanResult, explicitColumns []string) *QueryResult { | |
| 	if len(results) == 0 { | |
| 		// For empty results, combine auto-discovered columns with explicit ones | |
| 		columnSet := make(map[string]bool) | |
| 
 | |
| 		// Add explicit columns first | |
| 		for _, col := range explicitColumns { | |
| 			columnSet[col] = true | |
| 		} | |
| 
 | |
| 		// Build final column list | |
| 		columns := make([]string, 0, len(columnSet)) | |
| 		for col := range columnSet { | |
| 			columns = append(columns, col) | |
| 		} | |
| 
 | |
| 		return &QueryResult{ | |
| 			Columns:  columns, | |
| 			Rows:     [][]sqltypes.Value{}, | |
| 			Database: hms.topic.Namespace, | |
| 			Table:    hms.topic.Name, | |
| 		} | |
| 	} | |
| 
 | |
| 	// Auto-discover columns from data (like SELECT *) | |
| 	autoColumns := make(map[string]bool) | |
| 	for _, result := range results { | |
| 		for columnName := range result.Values { | |
| 			autoColumns[columnName] = true | |
| 		} | |
| 	} | |
| 
 | |
| 	// Combine auto-discovered and explicit columns | |
| 	columnSet := make(map[string]bool) | |
| 
 | |
| 	// Add auto-discovered columns first (regular data columns) | |
| 	for col := range autoColumns { | |
| 		columnSet[col] = true | |
| 	} | |
| 
 | |
| 	// Add explicit columns (may include system columns like _source) | |
| 	for _, col := range explicitColumns { | |
| 		columnSet[col] = true | |
| 	} | |
| 
 | |
| 	// Build final column list | |
| 	columns := make([]string, 0, len(columnSet)) | |
| 	for col := range columnSet { | |
| 		columns = append(columns, col) | |
| 	} | |
| 
 | |
| 	// If no data columns were found and no explicit columns specified, include system columns | |
| 	if len(columns) == 0 { | |
| 		columns = []string{SW_DISPLAY_NAME_TIMESTAMP, SW_COLUMN_NAME_KEY} | |
| 	} | |
| 
 | |
| 	// Convert to SQL rows | |
| 	rows := make([][]sqltypes.Value, len(results)) | |
| 	for i, result := range results { | |
| 		row := make([]sqltypes.Value, len(columns)) | |
| 		for j, columnName := range columns { | |
| 			switch columnName { | |
| 			case SW_COLUMN_NAME_TIMESTAMP: | |
| 				row[j] = sqltypes.NewInt64(result.Timestamp) | |
| 			case SW_COLUMN_NAME_KEY: | |
| 				row[j] = sqltypes.NewVarBinary(string(result.Key)) | |
| 			case SW_COLUMN_NAME_SOURCE: | |
| 				row[j] = sqltypes.NewVarChar(result.Source) | |
| 			default: | |
| 				// Regular data column | |
| 				if value, exists := result.Values[columnName]; exists { | |
| 					row[j] = convertSchemaValueToSQL(value) | |
| 				} else { | |
| 					row[j] = sqltypes.NULL | |
| 				} | |
| 			} | |
| 		} | |
| 		rows[i] = row | |
| 	} | |
| 
 | |
| 	return &QueryResult{ | |
| 		Columns:  columns, | |
| 		Rows:     rows, | |
| 		Database: hms.topic.Namespace, | |
| 		Table:    hms.topic.Name, | |
| 	} | |
| } | |
| 
 | |
| // ReadParquetStatistics efficiently reads column statistics from parquet files | |
| // without scanning the full file content - uses parquet's built-in metadata | |
| func (h *HybridMessageScanner) ReadParquetStatistics(partitionPath string) ([]*ParquetFileStats, error) { | |
| 	var fileStats []*ParquetFileStats | |
| 
 | |
| 	// Use the same chunk cache as the logstore package | |
| 	chunkCache := chunk_cache.NewChunkCacheInMemory(256) | |
| 	lookupFileIdFn := filer.LookupFn(h.filerClient) | |
| 
 | |
| 	err := filer_pb.ReadDirAllEntries(context.Background(), h.filerClient, util.FullPath(partitionPath), "", func(entry *filer_pb.Entry, isLast bool) error { | |
| 		// Only process parquet files | |
| 		if entry.IsDirectory || !strings.HasSuffix(entry.Name, ".parquet") { | |
| 			return nil | |
| 		} | |
| 
 | |
| 		// Extract statistics from this parquet file | |
| 		stats, err := h.extractParquetFileStats(entry, lookupFileIdFn, chunkCache) | |
| 		if err != nil { | |
| 			// Log error but continue processing other files | |
| 			fmt.Printf("Warning: failed to extract stats from %s: %v\n", entry.Name, err) | |
| 			return nil | |
| 		} | |
| 
 | |
| 		if stats != nil { | |
| 			fileStats = append(fileStats, stats) | |
| 		} | |
| 		return nil | |
| 	}) | |
| 
 | |
| 	return fileStats, err | |
| } | |
| 
 | |
| // extractParquetFileStats extracts column statistics from a single parquet file | |
| func (h *HybridMessageScanner) extractParquetFileStats(entry *filer_pb.Entry, lookupFileIdFn wdclient.LookupFileIdFunctionType, chunkCache *chunk_cache.ChunkCacheInMemory) (*ParquetFileStats, error) { | |
| 	// Create reader for the parquet file | |
| 	fileSize := filer.FileSize(entry) | |
| 	visibleIntervals, _ := filer.NonOverlappingVisibleIntervals(context.Background(), lookupFileIdFn, entry.Chunks, 0, int64(fileSize)) | |
| 	chunkViews := filer.ViewFromVisibleIntervals(visibleIntervals, 0, int64(fileSize)) | |
| 	readerCache := filer.NewReaderCache(32, chunkCache, lookupFileIdFn) | |
| 	readerAt := filer.NewChunkReaderAtFromClient(context.Background(), readerCache, chunkViews, int64(fileSize)) | |
| 
 | |
| 	// Create parquet reader - this only reads metadata, not data | |
| 	parquetReader := parquet.NewReader(readerAt) | |
| 	defer parquetReader.Close() | |
| 
 | |
| 	fileView := parquetReader.File() | |
| 
 | |
| 	fileStats := &ParquetFileStats{ | |
| 		FileName:    entry.Name, | |
| 		RowCount:    fileView.NumRows(), | |
| 		ColumnStats: make(map[string]*ParquetColumnStats), | |
| 	} | |
| 	// Populate optional min/max from filer extended attributes (writer stores ns timestamps) | |
| 	if entry != nil && entry.Extended != nil { | |
| 		if minBytes, ok := entry.Extended[mq.ExtendedAttrTimestampMin]; ok && len(minBytes) == 8 { | |
| 			fileStats.MinTimestampNs = int64(binary.BigEndian.Uint64(minBytes)) | |
| 		} | |
| 		if maxBytes, ok := entry.Extended[mq.ExtendedAttrTimestampMax]; ok && len(maxBytes) == 8 { | |
| 			fileStats.MaxTimestampNs = int64(binary.BigEndian.Uint64(maxBytes)) | |
| 		} | |
| 	} | |
| 
 | |
| 	// Get schema information | |
| 	schema := fileView.Schema() | |
| 
 | |
| 	// Process each row group | |
| 	rowGroups := fileView.RowGroups() | |
| 	for _, rowGroup := range rowGroups { | |
| 		columnChunks := rowGroup.ColumnChunks() | |
| 
 | |
| 		// Process each column chunk | |
| 		for i, chunk := range columnChunks { | |
| 			// Get column name from schema | |
| 			columnName := h.getColumnNameFromSchema(schema, i) | |
| 			if columnName == "" { | |
| 				continue | |
| 			} | |
| 
 | |
| 			// Try to get column statistics | |
| 			columnIndex, err := chunk.ColumnIndex() | |
| 			if err != nil { | |
| 				// No column index available - skip this column | |
| 				continue | |
| 			} | |
| 
 | |
| 			// Extract min/max values from the first page (for simplicity) | |
| 			// In a more sophisticated implementation, we could aggregate across all pages | |
| 			numPages := columnIndex.NumPages() | |
| 			if numPages == 0 { | |
| 				continue | |
| 			} | |
| 
 | |
| 			minParquetValue := columnIndex.MinValue(0) | |
| 			maxParquetValue := columnIndex.MaxValue(numPages - 1) | |
| 			nullCount := int64(0) | |
| 
 | |
| 			// Aggregate null counts across all pages | |
| 			for pageIdx := 0; pageIdx < numPages; pageIdx++ { | |
| 				nullCount += columnIndex.NullCount(pageIdx) | |
| 			} | |
| 
 | |
| 			// Convert parquet values to schema_pb.Value | |
| 			minValue, err := h.convertParquetValueToSchemaValue(minParquetValue) | |
| 			if err != nil { | |
| 				continue | |
| 			} | |
| 
 | |
| 			maxValue, err := h.convertParquetValueToSchemaValue(maxParquetValue) | |
| 			if err != nil { | |
| 				continue | |
| 			} | |
| 
 | |
| 			// Store column statistics (aggregate across row groups if column already exists) | |
| 			if existingStats, exists := fileStats.ColumnStats[columnName]; exists { | |
| 				// Update existing statistics | |
| 				if h.compareSchemaValues(minValue, existingStats.MinValue) < 0 { | |
| 					existingStats.MinValue = minValue | |
| 				} | |
| 				if h.compareSchemaValues(maxValue, existingStats.MaxValue) > 0 { | |
| 					existingStats.MaxValue = maxValue | |
| 				} | |
| 				existingStats.NullCount += nullCount | |
| 			} else { | |
| 				// Create new column statistics | |
| 				fileStats.ColumnStats[columnName] = &ParquetColumnStats{ | |
| 					ColumnName: columnName, | |
| 					MinValue:   minValue, | |
| 					MaxValue:   maxValue, | |
| 					NullCount:  nullCount, | |
| 					RowCount:   rowGroup.NumRows(), | |
| 				} | |
| 			} | |
| 		} | |
| 	} | |
| 
 | |
| 	return fileStats, nil | |
| } | |
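| // Example (illustrative): callers can use these file-level stats to prune parquet files in | |
| // time-range queries without reading any data pages, e.g.: | |
| // | |
| //	if stats.MaxTimestampNs > 0 && stats.MaxTimestampNs < queryStartNs { | |
| //		continue // hypothetical caller-side check; the file cannot contain matching rows | |
| //	} | |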
| 
 | |
| // getColumnNameFromSchema extracts column name from parquet schema by index | |
| func (h *HybridMessageScanner) getColumnNameFromSchema(schema *parquet.Schema, columnIndex int) string { | |
| 	// Get the leaf columns in order | |
| 	var columnNames []string | |
| 	h.collectColumnNames(schema.Fields(), &columnNames) | |
| 
 | |
| 	if columnIndex >= 0 && columnIndex < len(columnNames) { | |
| 		return columnNames[columnIndex] | |
| 	} | |
| 	return "" | |
| } | |
| 
 | |
| // collectColumnNames recursively collects leaf column names from schema | |
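| // Only the bare leaf name is recorded (no parent path). Column chunks within a row group | |
| // follow the schema's depth-first leaf order, which this traversal mirrors; topic schemas | |
| // scanned here are flat, so bare leaf names are expected to be unambiguous. | |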
| func (h *HybridMessageScanner) collectColumnNames(fields []parquet.Field, names *[]string) { | |
| 	for _, field := range fields { | |
| 		if len(field.Fields()) == 0 { | |
| 			// This is a leaf field (no sub-fields) | |
| 			*names = append(*names, field.Name()) | |
| 		} else { | |
| 			// This is a group - recurse | |
| 			h.collectColumnNames(field.Fields(), names) | |
| 		} | |
| 	} | |
| } | |
| 
 | |
| // convertParquetValueToSchemaValue converts parquet.Value to schema_pb.Value | |
| func (h *HybridMessageScanner) convertParquetValueToSchemaValue(pv parquet.Value) (*schema_pb.Value, error) { | |
| 	switch pv.Kind() { | |
| 	case parquet.Boolean: | |
| 		return &schema_pb.Value{Kind: &schema_pb.Value_BoolValue{BoolValue: pv.Boolean()}}, nil | |
| 	case parquet.Int32: | |
| 		return &schema_pb.Value{Kind: &schema_pb.Value_Int32Value{Int32Value: pv.Int32()}}, nil | |
| 	case parquet.Int64: | |
| 		return &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: pv.Int64()}}, nil | |
| 	case parquet.Float: | |
| 		return &schema_pb.Value{Kind: &schema_pb.Value_FloatValue{FloatValue: pv.Float()}}, nil | |
| 	case parquet.Double: | |
| 		return &schema_pb.Value{Kind: &schema_pb.Value_DoubleValue{DoubleValue: pv.Double()}}, nil | |
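| 	// Parquet encodes UTF8 strings as byte arrays, so string columns surface here as | |
| 	// BytesValue; extractRawValueFromSchema converts them back to strings for comparison | |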
| 	case parquet.ByteArray: | |
| 		return &schema_pb.Value{Kind: &schema_pb.Value_BytesValue{BytesValue: pv.ByteArray()}}, nil | |
| 	default: | |
| 		return nil, fmt.Errorf("unsupported parquet value kind: %v", pv.Kind()) | |
| 	} | |
| } | |
| 
 | |
| // compareSchemaValues compares two schema_pb.Value objects | |
| func (h *HybridMessageScanner) compareSchemaValues(v1, v2 *schema_pb.Value) int { | |
| 	if v1 == nil && v2 == nil { | |
| 		return 0 | |
| 	} | |
| 	if v1 == nil { | |
| 		return -1 | |
| 	} | |
| 	if v2 == nil { | |
| 		return 1 | |
| 	} | |
| 
 | |
| 	// Extract raw values and compare | |
| 	raw1 := h.extractRawValueFromSchema(v1) | |
| 	raw2 := h.extractRawValueFromSchema(v2) | |
| 
 | |
| 	return h.compareRawValues(raw1, raw2) | |
| } | |
| 
 | |
| // extractRawValueFromSchema extracts the raw value from schema_pb.Value | |
| func (h *HybridMessageScanner) extractRawValueFromSchema(value *schema_pb.Value) interface{} { | |
| 	switch v := value.Kind.(type) { | |
| 	case *schema_pb.Value_BoolValue: | |
| 		return v.BoolValue | |
| 	case *schema_pb.Value_Int32Value: | |
| 		return v.Int32Value | |
| 	case *schema_pb.Value_Int64Value: | |
| 		return v.Int64Value | |
| 	case *schema_pb.Value_FloatValue: | |
| 		return v.FloatValue | |
| 	case *schema_pb.Value_DoubleValue: | |
| 		return v.DoubleValue | |
| 	case *schema_pb.Value_BytesValue: | |
| 		return string(v.BytesValue) // Convert to string for comparison | |
| 	case *schema_pb.Value_StringValue: | |
| 		return v.StringValue | |
| 	} | |
| 	return nil | |
| } | |
| 
 | |
| // compareRawValues compares two raw values | |
| func (h *HybridMessageScanner) compareRawValues(v1, v2 interface{}) int { | |
| 	// Handle nil cases | |
| 	if v1 == nil && v2 == nil { | |
| 		return 0 | |
| 	} | |
| 	if v1 == nil { | |
| 		return -1 | |
| 	} | |
| 	if v2 == nil { | |
| 		return 1 | |
| 	} | |
| 
 | |
| 	// Compare based on type | |
| 	switch val1 := v1.(type) { | |
| 	case bool: | |
| 		if val2, ok := v2.(bool); ok { | |
| 			if val1 == val2 { | |
| 				return 0 | |
| 			} | |
| 			if val1 { | |
| 				return 1 | |
| 			} | |
| 			return -1 | |
| 		} | |
| 	case int32: | |
| 		if val2, ok := v2.(int32); ok { | |
| 			if val1 < val2 { | |
| 				return -1 | |
| 			} else if val1 > val2 { | |
| 				return 1 | |
| 			} | |
| 			return 0 | |
| 		} | |
| 	case int64: | |
| 		if val2, ok := v2.(int64); ok { | |
| 			if val1 < val2 { | |
| 				return -1 | |
| 			} else if val1 > val2 { | |
| 				return 1 | |
| 			} | |
| 			return 0 | |
| 		} | |
| 	case float32: | |
| 		if val2, ok := v2.(float32); ok { | |
| 			if val1 < val2 { | |
| 				return -1 | |
| 			} else if val1 > val2 { | |
| 				return 1 | |
| 			} | |
| 			return 0 | |
| 		} | |
| 	case float64: | |
| 		if val2, ok := v2.(float64); ok { | |
| 			if val1 < val2 { | |
| 				return -1 | |
| 			} else if val1 > val2 { | |
| 				return 1 | |
| 			} | |
| 			return 0 | |
| 		} | |
| 	case string: | |
| 		if val2, ok := v2.(string); ok { | |
| 			if val1 < val2 { | |
| 				return -1 | |
| 			} else if val1 > val2 { | |
| 				return 1 | |
| 			} | |
| 			return 0 | |
| 		} | |
| 	} | |
| 
 | |
| 	// Default: try string comparison | |
| 	str1 := fmt.Sprintf("%v", v1) | |
| 	str2 := fmt.Sprintf("%v", v2) | |
| 	if str1 < str2 { | |
| 		return -1 | |
| 	} else if str1 > str2 { | |
| 		return 1 | |
| 	} | |
| 	return 0 | |
| } | |
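| // Example: compareRawValues(int64(3), int64(7)) returns -1. Operands of different Go types | |
| // (e.g. int32 vs int64) fall through to the string comparison, where "9" sorts after "10"; | |
| // statistics compared here come from a single column and should share one type, so that | |
| // fallback is only a safety net. | |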
| 
 | |
| // streamingMerge merges multiple sorted data sources chronologically using a min-heap, | |
| // buffering only one pending record per source at a time instead of loading every source fully | |
| func (hms *HybridMessageScanner) streamingMerge(dataSources []StreamingDataSource, limit int) ([]HybridScanResult, error) { | |
| 	if len(dataSources) == 0 { | |
| 		return nil, nil | |
| 	} | |
| 
 | |
| 	var results []HybridScanResult | |
| 	mergeHeap := &StreamingMergeHeap{} | |
| 	heap.Init(mergeHeap) | |
| 
 | |
| 	// Initialize heap with first item from each data source | |
| 	for i, source := range dataSources { | |
| 		if source.HasMore() { | |
| 			result, err := source.Next() | |
| 			if err != nil { | |
| 				// Close all sources and return error | |
| 				for _, s := range dataSources { | |
| 					s.Close() | |
| 				} | |
| 				return nil, fmt.Errorf("failed to read from data source %d: %v", i, err) | |
| 			} | |
| 			if result != nil { | |
| 				heap.Push(mergeHeap, &StreamingMergeItem{ | |
| 					Result:     result, | |
| 					SourceID:   i, | |
| 					DataSource: source, | |
| 				}) | |
| 			} | |
| 		} | |
| 	} | |
| 
 | |
| 	// Process results in chronological order | |
| 	for mergeHeap.Len() > 0 { | |
| 		// Get next chronologically ordered result | |
| 		item := heap.Pop(mergeHeap).(*StreamingMergeItem) | |
| 		results = append(results, *item.Result) | |
| 
 | |
| 		// Check limit | |
| 		if limit > 0 && len(results) >= limit { | |
| 			break | |
| 		} | |
| 
 | |
| 		// Try to get next item from the same data source | |
| 		if item.DataSource.HasMore() { | |
| 			nextResult, err := item.DataSource.Next() | |
| 			if err != nil { | |
| 				// Log error but continue with other sources | |
| 				fmt.Printf("Warning: Error reading next item from source %d: %v\n", item.SourceID, err) | |
| 			} else if nextResult != nil { | |
| 				heap.Push(mergeHeap, &StreamingMergeItem{ | |
| 					Result:     nextResult, | |
| 					SourceID:   item.SourceID, | |
| 					DataSource: item.DataSource, | |
| 				}) | |
| 			} | |
| 		} | |
| 	} | |
| 
 | |
| 	// Close all data sources | |
| 	for _, source := range dataSources { | |
| 		source.Close() | |
| 	} | |
| 
 | |
| 	return results, nil | |
| } | |
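| // Example (illustrative): merging already-loaded broker results with a flushed-data stream, | |
| // where unflushedResults is a hypothetical pre-loaded slice: | |
| // | |
| //	sources := []StreamingDataSource{ | |
| //		NewSliceDataSource(unflushedResults), | |
| //		NewStreamingFlushedDataSource(hms, partition, options), | |
| //	} | |
| //	merged, err := hms.streamingMerge(sources, 100) // at most 100 rows, oldest first | |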
| 
 | |
| // SliceDataSource wraps a pre-loaded slice of results as a StreamingDataSource | |
| // This is used for unflushed data that is already loaded into memory | |
| type SliceDataSource struct { | |
| 	results []HybridScanResult | |
| 	index   int | |
| } | |
| 
 | |
| func NewSliceDataSource(results []HybridScanResult) *SliceDataSource { | |
| 	return &SliceDataSource{ | |
| 		results: results, | |
| 		index:   0, | |
| 	} | |
| } | |
| 
 | |
| func (s *SliceDataSource) Next() (*HybridScanResult, error) { | |
| 	if s.index >= len(s.results) { | |
| 		return nil, nil | |
| 	} | |
| 	result := &s.results[s.index] | |
| 	s.index++ | |
| 	return result, nil | |
| } | |
| 
 | |
| func (s *SliceDataSource) HasMore() bool { | |
| 	return s.index < len(s.results) | |
| } | |
| 
 | |
| func (s *SliceDataSource) Close() error { | |
| 	return nil // Nothing to clean up for slice-based source | |
| } | |
| 
 | |
| // StreamingFlushedDataSource provides streaming access to flushed data | |
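| // The first call to Next starts a background goroutine that scans flushed log and parquet | |
| // data for the partition and feeds rows through resultChan; Close signals it to stop via doneChan. | |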
| type StreamingFlushedDataSource struct { | |
| 	hms          *HybridMessageScanner | |
| 	partition    topic.Partition | |
| 	options      HybridScanOptions | |
| 	mergedReadFn func(startPosition log_buffer.MessagePosition, stopTsNs int64, eachLogEntryFn log_buffer.EachLogEntryFuncType) (lastReadPosition log_buffer.MessagePosition, isDone bool, err error) | |
| 	resultChan   chan *HybridScanResult | |
| 	errorChan    chan error | |
| 	doneChan     chan struct{} | |
| 	started      bool | |
| 	finished     bool | |
| 	closed       int32 // atomic flag guarding doneChan; set by Close() or when the producer goroutine exits | |
| 	mu           sync.RWMutex | |
| } | |
| 
 | |
| func NewStreamingFlushedDataSource(hms *HybridMessageScanner, partition topic.Partition, options HybridScanOptions) *StreamingFlushedDataSource { | |
| 	mergedReadFn := logstore.GenMergedReadFunc(hms.filerClient, hms.topic, partition) | |
| 
 | |
| 	return &StreamingFlushedDataSource{ | |
| 		hms:          hms, | |
| 		partition:    partition, | |
| 		options:      options, | |
| 		mergedReadFn: mergedReadFn, | |
| 		resultChan:   make(chan *HybridScanResult, 100), // Buffer for better performance | |
| 		errorChan:    make(chan error, 1), | |
| 		doneChan:     make(chan struct{}), | |
| 		started:      false, | |
| 		finished:     false, | |
| 	} | |
| } | |
| 
 | |
| func (s *StreamingFlushedDataSource) startStreaming() { | |
| 	// s.mu guards started so that concurrent Next() callers launch the producer goroutine only once | |
| 	s.mu.Lock() | |
| 	if s.started { | |
| 		s.mu.Unlock() | |
| 		return | |
| 	} | |
| 	s.started = true | |
| 	s.mu.Unlock() | |
| 
 | |
| 	go func() { | |
| 		defer func() { | |
| 			// Only this producer goroutine closes resultChan and errorChan, so the send | |
| 			// paths below can never write to a closed channel. doneChan is closed at most | |
| 			// once, by either Close() or this defer, guarded by the atomic flag. | |
| 			close(s.resultChan) | |
| 			close(s.errorChan) | |
| 			if atomic.CompareAndSwapInt32(&s.closed, 0, 1) { | |
| 				close(s.doneChan) | |
| 			} | |
| 		}() | |
| 
 | |
| 		// Set up time range for scanning | |
| 		startTime := time.Unix(0, s.options.StartTimeNs) | |
| 		if s.options.StartTimeNs == 0 { | |
| 			startTime = time.Unix(0, 0) | |
| 		} | |
| 
 | |
| 		stopTsNs := s.options.StopTimeNs | |
| 		// For SQL queries, stopTsNs = 0 means "no stop time restriction" | |
| 		// This is different from message queue consumers which want to stop at "now" | |
| 		// We detect SQL context by checking if we have a predicate function | |
| 		if stopTsNs == 0 && s.options.Predicate == nil { | |
| 			// Only set to current time for non-SQL queries (message queue consumers) | |
| 			stopTsNs = time.Now().UnixNano() | |
| 		} | |
| 		// If stopTsNs is still 0, it means this is a SQL query that wants unrestricted scanning | |
|  | |
| 		// Message processing function | |
| 		eachLogEntryFn := func(logEntry *filer_pb.LogEntry) (isDone bool, err error) { | |
| 			// Pre-decode DataMessage for reuse in both control check and conversion | |
| 			var dataMessage *mq_pb.DataMessage | |
| 			if len(logEntry.Data) > 0 { | |
| 				dataMessage = &mq_pb.DataMessage{} | |
| 				if err := proto.Unmarshal(logEntry.Data, dataMessage); err != nil { | |
| 					dataMessage = nil // Failed to decode, treat as raw data | |
| 				} | |
| 			} | |
| 
 | |
| 			// Skip control entries without actual data | |
| 			if s.hms.isControlEntryWithDecoded(logEntry, dataMessage) { | |
| 				return false, nil // Skip this entry | |
| 			} | |
| 
 | |
| 			// Convert log entry to schema_pb.RecordValue for consistent processing | |
| 			recordValue, source, convertErr := s.hms.convertLogEntryToRecordValueWithDecoded(logEntry, dataMessage) | |
| 			if convertErr != nil { | |
| 				return false, fmt.Errorf("failed to convert log entry: %v", convertErr) | |
| 			} | |
| 
 | |
| 			// Apply predicate filtering (WHERE clause) | |
| 			if s.options.Predicate != nil && !s.options.Predicate(recordValue) { | |
| 				return false, nil // Skip this message | |
| 			} | |
| 
 | |
| 			// Extract system columns | |
| 			timestamp := recordValue.Fields[SW_COLUMN_NAME_TIMESTAMP].GetInt64Value() | |
| 			key := recordValue.Fields[SW_COLUMN_NAME_KEY].GetBytesValue() | |
| 
 | |
| 			// Apply column projection | |
| 			values := make(map[string]*schema_pb.Value) | |
| 			if len(s.options.Columns) == 0 { | |
| 				// Select all columns (excluding system columns from user view) | |
| 				for name, value := range recordValue.Fields { | |
| 					if name != SW_COLUMN_NAME_TIMESTAMP && name != SW_COLUMN_NAME_KEY { | |
| 						values[name] = value | |
| 					} | |
| 				} | |
| 			} else { | |
| 				// Select specified columns only | |
| 				for _, columnName := range s.options.Columns { | |
| 					if value, exists := recordValue.Fields[columnName]; exists { | |
| 						values[columnName] = value | |
| 					} | |
| 				} | |
| 			} | |
| 
 | |
| 			result := &HybridScanResult{ | |
| 				Values:    values, | |
| 				Timestamp: timestamp, | |
| 				Key:       key, | |
| 				Source:    source, | |
| 			} | |
| 
 | |
| 			// Check if already closed before trying to send | |
| 			if atomic.LoadInt32(&s.closed) != 0 { | |
| 				return true, nil // Stop processing if closed | |
| 			} | |
| 
 | |
| 			// Send result to channel with proper handling of closed channels | |
| 			select { | |
| 			case s.resultChan <- result: | |
| 				return false, nil | |
| 			case <-s.doneChan: | |
| 				return true, nil // Stop processing if closed | |
| 			default: | |
| 				// Check again if closed (in case it was closed between the atomic check and select) | |
| 				if atomic.LoadInt32(&s.closed) != 0 { | |
| 					return true, nil | |
| 				} | |
| 				// If not closed, try sending again with blocking select | |
| 				select { | |
| 				case s.resultChan <- result: | |
| 					return false, nil | |
| 				case <-s.doneChan: | |
| 					return true, nil | |
| 				} | |
| 			} | |
| 		} | |
| 
 | |
| 		// Start scanning from the specified position | |
| 		startPosition := log_buffer.MessagePosition{Time: startTime} | |
| 		_, _, err := s.mergedReadFn(startPosition, stopTsNs, eachLogEntryFn) | |
| 
 | |
| 		if err != nil { | |
| 			// Only try to send error if not already closed | |
| 			if atomic.LoadInt32(&s.closed) == 0 { | |
| 				select { | |
| 				case s.errorChan <- fmt.Errorf("flushed data scan failed: %v", err): | |
| 				case <-s.doneChan: | |
| 				default: | |
| 					// Channel might be full or closed, ignore | |
| 				} | |
| 			} | |
| 		} | |
| 
 | |
| 		s.mu.Lock() | |
| 		s.finished = true | |
| 		s.mu.Unlock() | |
| 	}() | |
| } | |
| 
 | |
| func (s *StreamingFlushedDataSource) Next() (*HybridScanResult, error) { | |
| 	// startStreaming is idempotent and internally synchronized | |
| 	s.startStreaming() | |
| 
 | |
| 	select { | |
| 	case result, ok := <-s.resultChan: | |
| 		if !ok { | |
| 			return nil, nil // No more results | |
| 		} | |
| 		return result, nil | |
| 	case err := <-s.errorChan: | |
| 		return nil, err | |
| 	case <-s.doneChan: | |
| 		return nil, nil | |
| 	} | |
| } | |
| 
 | |
| func (s *StreamingFlushedDataSource) HasMore() bool { | |
| 	s.mu.RLock() | |
| 	defer s.mu.RUnlock() | |
| 	if !s.started { | |
| 		return true // Haven't started yet, so potentially has data | |
| 	} | |
| 	return !s.finished || len(s.resultChan) > 0 | |
| } | |
| 
 | |
| func (s *StreamingFlushedDataSource) Close() error { | |
| 	// Signal cancellation to the producer goroutine. resultChan and errorChan are closed | |
| 	// only by the producer itself, which avoids a panic from sending on a closed channel | |
| 	// while a scan is still in flight. | |
| 	if atomic.CompareAndSwapInt32(&s.closed, 0, 1) { | |
| 		close(s.doneChan) | |
| 	} | |
| 	return nil | |
| } | |
| 
 | |
| // mergeSort sorts the given HybridScanResult slice by timestamp using a stable merge sort | |
| func (hms *HybridMessageScanner) mergeSort(results []HybridScanResult, left, right int) { | |
| 	if left < right { | |
| 		mid := left + (right-left)/2 | |
| 
 | |
| 		// Recursively sort both halves | |
| 		hms.mergeSort(results, left, mid) | |
| 		hms.mergeSort(results, mid+1, right) | |
| 
 | |
| 		// Merge the sorted halves | |
| 		hms.merge(results, left, mid, right) | |
| 	} | |
| } | |
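| // Example (illustrative): sorting a whole result slice by timestamp, oldest first: | |
| // | |
| //	hms.mergeSort(results, 0, len(results)-1) | |
| // | |
| // The merge step compares with <=, so results with equal timestamps keep their relative | |
| // order (the sort is stable). | |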
| 
 | |
| // merge combines two sorted subarrays into a single sorted array | |
| func (hms *HybridMessageScanner) merge(results []HybridScanResult, left, mid, right int) { | |
| 	// Create temporary arrays for the two subarrays | |
| 	leftArray := make([]HybridScanResult, mid-left+1) | |
| 	rightArray := make([]HybridScanResult, right-mid) | |
| 
 | |
| 	// Copy data to temporary arrays | |
| 	copy(leftArray, results[left:mid+1]) | |
| 	copy(rightArray, results[mid+1:right+1]) | |
| 
 | |
| 	// Merge the temporary arrays back into results[left..right] | |
| 	i, j, k := 0, 0, left | |
| 
 | |
| 	for i < len(leftArray) && j < len(rightArray) { | |
| 		if leftArray[i].Timestamp <= rightArray[j].Timestamp { | |
| 			results[k] = leftArray[i] | |
| 			i++ | |
| 		} else { | |
| 			results[k] = rightArray[j] | |
| 			j++ | |
| 		} | |
| 		k++ | |
| 	} | |
| 
 | |
| 	// Copy remaining elements of leftArray, if any | |
| 	for i < len(leftArray) { | |
| 		results[k] = leftArray[i] | |
| 		i++ | |
| 		k++ | |
| 	} | |
| 
 | |
| 	// Copy remaining elements of rightArray, if any | |
| 	for j < len(rightArray) { | |
| 		results[k] = rightArray[j] | |
| 		j++ | |
| 		k++ | |
| 	} | |
| }
 |