refactor

6 months ago · e385f0ce7d
6 changed files with 1036 additions and 170 deletions
--- a/weed/query/engine/aggregations.go
+++ b/weed/query/engine/aggregations.go
@ -0,0 +1,551 @@
+package engine
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
+	"github.com/seaweedfs/seaweedfs/weed/query/sqltypes"
+	"github.com/xwb1989/sqlparser"
+)
+
+// AggregationSpec defines an aggregation function to be computed
+type AggregationSpec struct {
+	Function string // COUNT, SUM, AVG, MIN, MAX
+	Column   string // Column name, or "*" for COUNT(*)
+	Alias    string // Optional alias for the result column
+	Distinct bool   // Support for DISTINCT keyword
+}
+
+// AggregationResult holds the computed result of an aggregation
+type AggregationResult struct {
+	Count int64
+	Sum   float64
+	Min   interface{}
+	Max   interface{}
+}
+
+// AggregationStrategy represents the strategy for executing aggregations
+type AggregationStrategy struct {
+	CanUseFastPath   bool
+	Reason           string
+	UnsupportedSpecs []AggregationSpec
+}
+
+// TopicDataSources represents the data sources available for a topic
+type TopicDataSources struct {
+	ParquetFiles    map[string][]*ParquetFileStats // partitionPath -> parquet file stats
+	ParquetRowCount int64
+	LiveLogRowCount int64
+	PartitionsCount int
+}
+
+// FastPathOptimizer handles fast path aggregation optimization decisions
+type FastPathOptimizer struct {
+	engine *SQLEngine
+}
+
+// NewFastPathOptimizer creates a new fast path optimizer
+func NewFastPathOptimizer(engine *SQLEngine) *FastPathOptimizer {
+	return &FastPathOptimizer{engine: engine}
+}
+
+// DetermineStrategy analyzes aggregations and determines if fast path can be used
+func (opt *FastPathOptimizer) DetermineStrategy(aggregations []AggregationSpec) AggregationStrategy {
+	strategy := AggregationStrategy{
+		CanUseFastPath:   true,
+		Reason:           "all_aggregations_supported",
+		UnsupportedSpecs: []AggregationSpec{},
+	}
+
+	for _, spec := range aggregations {
+		if !opt.engine.canUseParquetStatsForAggregation(spec) {
+			strategy.CanUseFastPath = false
+			strategy.Reason = "unsupported_aggregation_functions"
+			strategy.UnsupportedSpecs = append(strategy.UnsupportedSpecs, spec)
+		}
+	}
+
+	return strategy
+}
+
+// CollectDataSources gathers information about available data sources for a topic
+func (opt *FastPathOptimizer) CollectDataSources(ctx context.Context, hybridScanner *HybridMessageScanner) (*TopicDataSources, error) {
+	dataSources := &TopicDataSources{
+		ParquetFiles:    make(map[string][]*ParquetFileStats),
+		ParquetRowCount: 0,
+		LiveLogRowCount: 0,
+		PartitionsCount: 0,
+	}
+
+	// Discover partitions for the topic
+	relativePartitions, err := opt.engine.discoverTopicPartitions(hybridScanner.topic.Namespace, hybridScanner.topic.Name)
+	if err != nil {
+		return dataSources, DataSourceError{
+			Source: "partition_discovery",
+			Cause:  err,
+		}
+	}
+
+	topicBasePath := fmt.Sprintf("/topics/%s/%s", hybridScanner.topic.Namespace, hybridScanner.topic.Name)
+
+	// Collect stats from each partition
+	for _, relPartition := range relativePartitions {
+		partitionPath := fmt.Sprintf("%s/%s", topicBasePath, relPartition)
+
+		// Read parquet file statistics
+		parquetStats, err := hybridScanner.ReadParquetStatistics(partitionPath)
+		if err == nil && len(parquetStats) > 0 {
+			dataSources.ParquetFiles[partitionPath] = parquetStats
+			for _, stat := range parquetStats {
+				dataSources.ParquetRowCount += stat.RowCount
+			}
+		}
+
+		// Count live log files
+		liveLogCount, _ := opt.engine.countLiveLogFiles(partitionPath, dataSources.ParquetFiles[partitionPath])
+		dataSources.LiveLogRowCount += liveLogCount
+	}
+
+	dataSources.PartitionsCount = len(relativePartitions)
+
+	return dataSources, nil
+}
+
+// AggregationComputer handles the computation of aggregations using fast path
+type AggregationComputer struct {
+	engine *SQLEngine
+}
+
+// NewAggregationComputer creates a new aggregation computer
+func NewAggregationComputer(engine *SQLEngine) *AggregationComputer {
+	return &AggregationComputer{engine: engine}
+}
+
+// ComputeFastPathAggregations computes aggregations using parquet statistics and live log data
+func (comp *AggregationComputer) ComputeFastPathAggregations(
+	ctx context.Context,
+	aggregations []AggregationSpec,
+	dataSources *TopicDataSources,
+	partitions []string,
+) ([]AggregationResult, error) {
+
+	aggResults := make([]AggregationResult, len(aggregations))
+
+	for i, spec := range aggregations {
+		switch spec.Function {
+		case "COUNT":
+			if spec.Column == "*" {
+				aggResults[i].Count = dataSources.ParquetRowCount + dataSources.LiveLogRowCount
+			} else {
+				// For specific columns, we might need to account for NULLs in the future
+				aggResults[i].Count = dataSources.ParquetRowCount + dataSources.LiveLogRowCount
+			}
+
+		case "MIN":
+			globalMin, err := comp.computeGlobalMin(spec, dataSources, partitions)
+			if err != nil {
+				return nil, AggregationError{
+					Operation: spec.Function,
+					Column:    spec.Column,
+					Cause:     err,
+				}
+			}
+			aggResults[i].Min = globalMin
+
+		case "MAX":
+			globalMax, err := comp.computeGlobalMax(spec, dataSources, partitions)
+			if err != nil {
+				return nil, AggregationError{
+					Operation: spec.Function,
+					Column:    spec.Column,
+					Cause:     err,
+				}
+			}
+			aggResults[i].Max = globalMax
+
+		default:
+			return nil, OptimizationError{
+				Strategy: "fast_path_aggregation",
+				Reason:   fmt.Sprintf("unsupported aggregation function: %s", spec.Function),
+			}
+		}
+	}
+
+	return aggResults, nil
+}
+
+// computeGlobalMin computes the global minimum value across all data sources
+func (comp *AggregationComputer) computeGlobalMin(spec AggregationSpec, dataSources *TopicDataSources, partitions []string) (interface{}, error) {
+	var globalMin interface{}
+	var globalMinValue *schema_pb.Value
+	hasParquetStats := false
+
+	// Step 1: Get minimum from parquet statistics
+	for _, fileStats := range dataSources.ParquetFiles {
+		for _, fileStat := range fileStats {
+			// Try case-insensitive column lookup
+			var colStats *ParquetColumnStats
+			var found bool
+
+			// First try exact match
+			if stats, exists := fileStat.ColumnStats[spec.Column]; exists {
+				colStats = stats
+				found = true
+			} else {
+				// Try case-insensitive lookup
+				for colName, stats := range fileStat.ColumnStats {
+					if strings.EqualFold(colName, spec.Column) {
+						colStats = stats
+						found = true
+						break
+					}
+				}
+			}
+
+			if found && colStats != nil && colStats.MinValue != nil {
+				if globalMinValue == nil || comp.engine.compareValues(colStats.MinValue, globalMinValue) < 0 {
+					globalMinValue = colStats.MinValue
+					extractedValue := comp.engine.extractRawValue(colStats.MinValue)
+					if extractedValue != nil {
+						globalMin = extractedValue
+						hasParquetStats = true
+					}
+				}
+			}
+		}
+	}
+
+	// Step 2: Get minimum from live log data (only if no live logs or if we need to compare)
+	if dataSources.LiveLogRowCount > 0 {
+		for _, partition := range partitions {
+			partitionParquetSources := make(map[string]bool)
+			if partitionFileStats, exists := dataSources.ParquetFiles[partition]; exists {
+				partitionParquetSources = comp.engine.extractParquetSourceFiles(partitionFileStats)
+			}
+
+			liveLogMin, _, err := comp.engine.computeLiveLogMinMax(partition, spec.Column, partitionParquetSources)
+			if err != nil {
+				continue // Skip partitions with errors
+			}
+
+			if liveLogMin != nil {
+				if globalMin == nil {
+					globalMin = liveLogMin
+				} else {
+					liveLogSchemaValue := comp.engine.convertRawValueToSchemaValue(liveLogMin)
+					if liveLogSchemaValue != nil && comp.engine.compareValues(liveLogSchemaValue, globalMinValue) < 0 {
+						globalMin = liveLogMin
+						globalMinValue = liveLogSchemaValue
+					}
+				}
+			}
+		}
+	}
+
+	// Step 3: Handle system columns if no regular data found
+	if globalMin == nil && !hasParquetStats {
+		globalMin = comp.engine.getSystemColumnGlobalMin(spec.Column, dataSources.ParquetFiles)
+	}
+
+	return globalMin, nil
+}
+
+// computeGlobalMax computes the global maximum value across all data sources
+func (comp *AggregationComputer) computeGlobalMax(spec AggregationSpec, dataSources *TopicDataSources, partitions []string) (interface{}, error) {
+	var globalMax interface{}
+	var globalMaxValue *schema_pb.Value
+	hasParquetStats := false
+
+	// Step 1: Get maximum from parquet statistics
+	for _, fileStats := range dataSources.ParquetFiles {
+		for _, fileStat := range fileStats {
+			// Try case-insensitive column lookup
+			var colStats *ParquetColumnStats
+			var found bool
+
+			// First try exact match
+			if stats, exists := fileStat.ColumnStats[spec.Column]; exists {
+				colStats = stats
+				found = true
+			} else {
+				// Try case-insensitive lookup
+				for colName, stats := range fileStat.ColumnStats {
+					if strings.EqualFold(colName, spec.Column) {
+						colStats = stats
+						found = true
+						break
+					}
+				}
+			}
+
+			if found && colStats != nil && colStats.MaxValue != nil {
+				if globalMaxValue == nil || comp.engine.compareValues(colStats.MaxValue, globalMaxValue) > 0 {
+					globalMaxValue = colStats.MaxValue
+					extractedValue := comp.engine.extractRawValue(colStats.MaxValue)
+					if extractedValue != nil {
+						globalMax = extractedValue
+						hasParquetStats = true
+					}
+				}
+			}
+		}
+	}
+
+	// Step 2: Get maximum from live log data (only if live logs exist)
+	if dataSources.LiveLogRowCount > 0 {
+		for _, partition := range partitions {
+			partitionParquetSources := make(map[string]bool)
+			if partitionFileStats, exists := dataSources.ParquetFiles[partition]; exists {
+				partitionParquetSources = comp.engine.extractParquetSourceFiles(partitionFileStats)
+			}
+
+			_, liveLogMax, err := comp.engine.computeLiveLogMinMax(partition, spec.Column, partitionParquetSources)
+			if err != nil {
+				continue // Skip partitions with errors
+			}
+
+			if liveLogMax != nil {
+				if globalMax == nil {
+					globalMax = liveLogMax
+				} else {
+					liveLogSchemaValue := comp.engine.convertRawValueToSchemaValue(liveLogMax)
+					if liveLogSchemaValue != nil && comp.engine.compareValues(liveLogSchemaValue, globalMaxValue) > 0 {
+						globalMax = liveLogMax
+						globalMaxValue = liveLogSchemaValue
+					}
+				}
+			}
+		}
+	}
+
+	// Step 3: Handle system columns if no regular data found
+	if globalMax == nil && !hasParquetStats {
+		globalMax = comp.engine.getSystemColumnGlobalMax(spec.Column, dataSources.ParquetFiles)
+	}
+
+	return globalMax, nil
+}
+
+// executeAggregationQuery handles SELECT queries with aggregation functions
+func (e *SQLEngine) executeAggregationQuery(ctx context.Context, hybridScanner *HybridMessageScanner, aggregations []AggregationSpec, stmt *sqlparser.Select) (*QueryResult, error) {
+	// Parse WHERE clause for filtering
+	var predicate func(*schema_pb.RecordValue) bool
+	var err error
+	if stmt.Where != nil {
+		predicate, err = e.buildPredicate(stmt.Where.Expr)
+		if err != nil {
+			return &QueryResult{Error: err}, err
+		}
+	}
+
+	// Extract time filters for optimization
+	startTimeNs, stopTimeNs := int64(0), int64(0)
+	if stmt.Where != nil {
+		startTimeNs, stopTimeNs = e.extractTimeFilters(stmt.Where.Expr)
+	}
+
+	// FAST PATH: Try to use parquet statistics for optimization
+	// This can be ~130x faster than scanning all data
+	if stmt.Where == nil { // Only optimize when no complex WHERE clause
+		fastResult, canOptimize := e.tryFastParquetAggregation(ctx, hybridScanner, aggregations)
+		if canOptimize {
+			fmt.Printf("Using fast hybrid statistics for aggregation (parquet stats + live log counts)\n")
+			return fastResult, nil
+		}
+	}
+
+	// SLOW PATH: Fall back to full table scan
+	fmt.Printf("Using full table scan for aggregation (parquet optimization not applicable)\n")
+
+	// Build scan options for full table scan (aggregations need all data)
+	hybridScanOptions := HybridScanOptions{
+		StartTimeNs: startTimeNs,
+		StopTimeNs:  stopTimeNs,
+		Limit:       0, // No limit for aggregations - need all data
+		Predicate:   predicate,
+	}
+
+	// Execute the hybrid scan to get all matching records
+	results, err := hybridScanner.Scan(ctx, hybridScanOptions)
+	if err != nil {
+		return &QueryResult{Error: err}, err
+	}
+
+	// Compute aggregations
+	aggResults := e.computeAggregations(results, aggregations)
+
+	// Build result set
+	columns := make([]string, len(aggregations))
+	row := make([]sqltypes.Value, len(aggregations))
+
+	for i, spec := range aggregations {
+		columns[i] = spec.Alias
+		row[i] = e.formatAggregationResult(spec, aggResults[i])
+	}
+
+	return &QueryResult{
+		Columns: columns,
+		Rows:    [][]sqltypes.Value{row},
+	}, nil
+}
+
+// tryFastParquetAggregation attempts to compute aggregations using hybrid approach:
+// - Use parquet metadata for parquet files
+// - Count live log files for live data
+// - Combine both for accurate results per partition
+// Returns (result, canOptimize) where canOptimize=true means the hybrid fast path was used
+func (e *SQLEngine) tryFastParquetAggregation(ctx context.Context, hybridScanner *HybridMessageScanner, aggregations []AggregationSpec) (*QueryResult, bool) {
+	// Use the new modular components
+	optimizer := NewFastPathOptimizer(e)
+	computer := NewAggregationComputer(e)
+
+	// Step 1: Determine strategy
+	strategy := optimizer.DetermineStrategy(aggregations)
+	if !strategy.CanUseFastPath {
+		return nil, false
+	}
+
+	// Step 2: Collect data sources
+	dataSources, err := optimizer.CollectDataSources(ctx, hybridScanner)
+	if err != nil {
+		return nil, false
+	}
+
+	// Build partition list for aggregation computer
+	relativePartitions, err := e.discoverTopicPartitions(hybridScanner.topic.Namespace, hybridScanner.topic.Name)
+	if err != nil {
+		return nil, false
+	}
+
+	topicBasePath := fmt.Sprintf("/topics/%s/%s", hybridScanner.topic.Namespace, hybridScanner.topic.Name)
+	partitions := make([]string, len(relativePartitions))
+	for i, relPartition := range relativePartitions {
+		partitions[i] = fmt.Sprintf("%s/%s", topicBasePath, relPartition)
+	}
+
+	// Debug: Show the hybrid optimization results
+	if dataSources.ParquetRowCount > 0 || dataSources.LiveLogRowCount > 0 {
+		partitionsWithLiveLogs := 0
+		if dataSources.LiveLogRowCount > 0 {
+			partitionsWithLiveLogs = 1 // Simplified for now
+		}
+		fmt.Printf("Hybrid fast aggregation with deduplication: %d parquet rows + %d deduplicated live log rows from %d partitions\n",
+			dataSources.ParquetRowCount, dataSources.LiveLogRowCount, partitionsWithLiveLogs)
+	}
+
+	// Step 3: Compute aggregations using fast path
+	aggResults, err := computer.ComputeFastPathAggregations(ctx, aggregations, dataSources, partitions)
+	if err != nil {
+		return nil, false
+	}
+
+	// Step 4: Build final query result
+	columns := make([]string, len(aggregations))
+	row := make([]sqltypes.Value, len(aggregations))
+
+	for i, spec := range aggregations {
+		columns[i] = spec.Alias
+		row[i] = e.formatAggregationResult(spec, aggResults[i])
+	}
+
+	result := &QueryResult{
+		Columns: columns,
+		Rows:    [][]sqltypes.Value{row},
+	}
+
+	return result, true
+}
+
+// computeAggregations computes aggregation results from a full table scan
+func (e *SQLEngine) computeAggregations(results []HybridScanResult, aggregations []AggregationSpec) []AggregationResult {
+	aggResults := make([]AggregationResult, len(aggregations))
+
+	for i, spec := range aggregations {
+		switch spec.Function {
+		case "COUNT":
+			if spec.Column == "*" {
+				aggResults[i].Count = int64(len(results))
+			} else {
+				count := int64(0)
+				for _, result := range results {
+					if value := e.findColumnValue(result, spec.Column); value != nil && !e.isNullValue(value) {
+						count++
+					}
+				}
+				aggResults[i].Count = count
+			}
+
+		case "SUM":
+			sum := float64(0)
+			for _, result := range results {
+				if value := e.findColumnValue(result, spec.Column); value != nil {
+					if numValue := e.convertToNumber(value); numValue != nil {
+						sum += *numValue
+					}
+				}
+			}
+			aggResults[i].Sum = sum
+
+		case "AVG":
+			sum := float64(0)
+			count := int64(0)
+			for _, result := range results {
+				if value := e.findColumnValue(result, spec.Column); value != nil {
+					if numValue := e.convertToNumber(value); numValue != nil {
+						sum += *numValue
+						count++
+					}
+				}
+			}
+			if count > 0 {
+				aggResults[i].Sum = sum / float64(count) // Store average in Sum field
+				aggResults[i].Count = count
+			}
+
+		case "MIN":
+			var min interface{}
+			var minValue *schema_pb.Value
+			for _, result := range results {
+				if value := e.findColumnValue(result, spec.Column); value != nil {
+					if minValue == nil || e.compareValues(value, minValue) < 0 {
+						minValue = value
+						min = e.extractRawValue(value)
+					}
+				}
+			}
+			aggResults[i].Min = min
+
+		case "MAX":
+			var max interface{}
+			var maxValue *schema_pb.Value
+			for _, result := range results {
+				if value := e.findColumnValue(result, spec.Column); value != nil {
+					if maxValue == nil || e.compareValues(value, maxValue) > 0 {
+						maxValue = value
+						max = e.extractRawValue(value)
+					}
+				}
+			}
+			aggResults[i].Max = max
+		}
+	}
+
+	return aggResults
+}
+
+// canUseParquetStatsForAggregation determines if an aggregation can be optimized with parquet stats
+func (e *SQLEngine) canUseParquetStatsForAggregation(spec AggregationSpec) bool {
+	switch spec.Function {
+	case "COUNT":
+		return spec.Column == "*" || e.isSystemColumn(spec.Column) || e.isRegularColumn(spec.Column)
+	case "MIN", "MAX":
+		return e.isSystemColumn(spec.Column) || e.isRegularColumn(spec.Column)
+	case "SUM", "AVG":
+		// These require scanning actual values, not just min/max
+		return false
+	default:
+		return false
+	}
+}
--- a/weed/query/engine/data_conversion.go
+++ b/weed/query/engine/data_conversion.go
@ -0,0 +1,217 @@
+package engine
+
+import (
+	"fmt"
+
+	"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
+	"github.com/seaweedfs/seaweedfs/weed/query/sqltypes"
+)
+
+// formatAggregationResult formats an aggregation result into a SQL value
+func (e *SQLEngine) formatAggregationResult(spec AggregationSpec, result AggregationResult) sqltypes.Value {
+	switch spec.Function {
+	case "COUNT":
+		return sqltypes.NewInt64(result.Count)
+	case "SUM":
+		return sqltypes.NewFloat64(result.Sum)
+	case "AVG":
+		return sqltypes.NewFloat64(result.Sum) // Sum contains the average for AVG
+	case "MIN":
+		if result.Min != nil {
+			return e.convertRawValueToSQL(result.Min)
+		}
+		return sqltypes.NULL
+	case "MAX":
+		if result.Max != nil {
+			return e.convertRawValueToSQL(result.Max)
+		}
+		return sqltypes.NULL
+	}
+	return sqltypes.NULL
+}
+
+// convertRawValueToSQL converts a raw Go value to a SQL value
+func (e *SQLEngine) convertRawValueToSQL(value interface{}) sqltypes.Value {
+	switch v := value.(type) {
+	case int32:
+		return sqltypes.NewInt32(v)
+	case int64:
+		return sqltypes.NewInt64(v)
+	case float32:
+		return sqltypes.NewFloat32(v)
+	case float64:
+		return sqltypes.NewFloat64(v)
+	case string:
+		return sqltypes.NewVarChar(v)
+	case bool:
+		if v {
+			return sqltypes.NewVarChar("1")
+		}
+		return sqltypes.NewVarChar("0")
+	}
+	return sqltypes.NULL
+}
+
+// extractRawValue extracts the raw Go value from a schema_pb.Value
+func (e *SQLEngine) extractRawValue(value *schema_pb.Value) interface{} {
+	switch v := value.Kind.(type) {
+	case *schema_pb.Value_Int32Value:
+		return v.Int32Value
+	case *schema_pb.Value_Int64Value:
+		return v.Int64Value
+	case *schema_pb.Value_FloatValue:
+		return v.FloatValue
+	case *schema_pb.Value_DoubleValue:
+		return v.DoubleValue
+	case *schema_pb.Value_StringValue:
+		return v.StringValue
+	case *schema_pb.Value_BoolValue:
+		return v.BoolValue
+	case *schema_pb.Value_BytesValue:
+		return string(v.BytesValue) // Convert bytes to string for comparison
+	}
+	return nil
+}
+
+// compareValues compares two schema_pb.Value objects
+func (e *SQLEngine) compareValues(value1 *schema_pb.Value, value2 *schema_pb.Value) int {
+	if value2 == nil {
+		return 1 // value1 > nil
+	}
+	raw1 := e.extractRawValue(value1)
+	raw2 := e.extractRawValue(value2)
+	if raw1 == nil {
+		return -1
+	}
+	if raw2 == nil {
+		return 1
+	}
+
+	// Simple comparison - in a full implementation this would handle type coercion
+	switch v1 := raw1.(type) {
+	case int32:
+		if v2, ok := raw2.(int32); ok {
+			if v1 < v2 {
+				return -1
+			} else if v1 > v2 {
+				return 1
+			}
+			return 0
+		}
+	case int64:
+		if v2, ok := raw2.(int64); ok {
+			if v1 < v2 {
+				return -1
+			} else if v1 > v2 {
+				return 1
+			}
+			return 0
+		}
+	case float32:
+		if v2, ok := raw2.(float32); ok {
+			if v1 < v2 {
+				return -1
+			} else if v1 > v2 {
+				return 1
+			}
+			return 0
+		}
+	case float64:
+		if v2, ok := raw2.(float64); ok {
+			if v1 < v2 {
+				return -1
+			} else if v1 > v2 {
+				return 1
+			}
+			return 0
+		}
+	case string:
+		if v2, ok := raw2.(string); ok {
+			if v1 < v2 {
+				return -1
+			} else if v1 > v2 {
+				return 1
+			}
+			return 0
+		}
+	case bool:
+		if v2, ok := raw2.(bool); ok {
+			if v1 == v2 {
+				return 0
+			} else if v1 && !v2 {
+				return 1
+			}
+			return -1
+		}
+	}
+	return 0
+}
+
+// convertRawValueToSchemaValue converts raw Go values back to schema_pb.Value for comparison
+func (e *SQLEngine) convertRawValueToSchemaValue(rawValue interface{}) *schema_pb.Value {
+	switch v := rawValue.(type) {
+	case int32:
+		return &schema_pb.Value{Kind: &schema_pb.Value_Int32Value{Int32Value: v}}
+	case int64:
+		return &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: v}}
+	case float32:
+		return &schema_pb.Value{Kind: &schema_pb.Value_FloatValue{FloatValue: v}}
+	case float64:
+		return &schema_pb.Value{Kind: &schema_pb.Value_DoubleValue{DoubleValue: v}}
+	case string:
+		return &schema_pb.Value{Kind: &schema_pb.Value_StringValue{StringValue: v}}
+	case bool:
+		return &schema_pb.Value{Kind: &schema_pb.Value_BoolValue{BoolValue: v}}
+	case []byte:
+		return &schema_pb.Value{Kind: &schema_pb.Value_BytesValue{BytesValue: v}}
+	default:
+		// Convert other types to string as fallback
+		return &schema_pb.Value{Kind: &schema_pb.Value_StringValue{StringValue: fmt.Sprintf("%v", v)}}
+	}
+}
+
+// convertJSONValueToSchemaValue converts JSON values to schema_pb.Value
+func (e *SQLEngine) convertJSONValueToSchemaValue(jsonValue interface{}) *schema_pb.Value {
+	switch v := jsonValue.(type) {
+	case string:
+		return &schema_pb.Value{Kind: &schema_pb.Value_StringValue{StringValue: v}}
+	case float64:
+		// JSON numbers are always float64, try to detect if it's actually an integer
+		if v == float64(int64(v)) {
+			return &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: int64(v)}}
+		}
+		return &schema_pb.Value{Kind: &schema_pb.Value_DoubleValue{DoubleValue: v}}
+	case bool:
+		return &schema_pb.Value{Kind: &schema_pb.Value_BoolValue{BoolValue: v}}
+	case nil:
+		return nil
+	default:
+		// Convert other types to string
+		return &schema_pb.Value{Kind: &schema_pb.Value_StringValue{StringValue: fmt.Sprintf("%v", v)}}
+	}
+}
+
+// Helper functions for aggregation processing
+
+// isNullValue checks if a schema_pb.Value is null or empty
+func (e *SQLEngine) isNullValue(value *schema_pb.Value) bool {
+	return value == nil || value.Kind == nil
+}
+
+// convertToNumber converts a schema_pb.Value to a float64 for numeric operations
+func (e *SQLEngine) convertToNumber(value *schema_pb.Value) *float64 {
+	switch v := value.Kind.(type) {
+	case *schema_pb.Value_Int32Value:
+		result := float64(v.Int32Value)
+		return &result
+	case *schema_pb.Value_Int64Value:
+		result := float64(v.Int64Value)
+		return &result
+	case *schema_pb.Value_FloatValue:
+		result := float64(v.FloatValue)
+		return &result
+	case *schema_pb.Value_DoubleValue:
+		return &v.DoubleValue
+	}
+	return nil
+}
--- a/weed/query/engine/engine.go
+++ b/weed/query/engine/engine.go
@ -31,30 +31,6 @@ type SQLEngine struct {
 	catalog *SchemaCatalog
 }

-// QueryExecutionPlan contains information about how a query was executed
-type QueryExecutionPlan struct {
-	QueryType           string                 `json:"query_type"`         // SELECT, INSERT, etc.
-	ExecutionStrategy   string                 `json:"execution_strategy"` // fast_path, full_scan, hybrid
-	DataSources         []string               `json:"data_sources"`       // parquet_files, live_logs
-	PartitionsScanned   int                    `json:"partitions_scanned"`
-	ParquetFilesScanned int                    `json:"parquet_files_scanned"`
-	LiveLogFilesScanned int                    `json:"live_log_files_scanned"`
-	TotalRowsProcessed  int64                  `json:"total_rows_processed"`
-	OptimizationsUsed   []string               `json:"optimizations_used"` // parquet_stats, predicate_pushdown, etc.
-	TimeRangeFilters    map[string]interface{} `json:"time_range_filters,omitempty"`
-	Aggregations        []string               `json:"aggregations,omitempty"`
-	ExecutionTimeMs     float64                `json:"execution_time_ms"`
-	Details             map[string]interface{} `json:"details,omitempty"`
-}
-
-// QueryResult represents the result of a SQL query execution
-type QueryResult struct {
-	Columns       []string            `json:"columns"`
-	Rows          [][]sqltypes.Value  `json:"rows"`
-	Error         error               `json:"error,omitempty"`
-	ExecutionPlan *QueryExecutionPlan `json:"execution_plan,omitempty"`
-}
-
 // NewSQLEngine creates a new SQL execution engine
 // Uses master address for service discovery and initialization
 func NewSQLEngine(masterAddress string) *SQLEngine {
@ -1515,152 +1491,6 @@ func (e *SQLEngine) dropTable(ctx context.Context, stmt *sqlparser.DDL) (*QueryR
 	return result, nil
 }

-// AggregationSpec defines an aggregation function to be computed
-type AggregationSpec struct {
-	Function string // COUNT, SUM, AVG, MIN, MAX
-	Column   string // Column name, or "*" for COUNT(*)
-	Alias    string // Optional alias for the result column
-	Distinct bool   // Support for DISTINCT keyword
-}
-
-// AggregationResult holds the computed result of an aggregation
-type AggregationResult struct {
-	Count int64
-	Sum   float64
-	Min   interface{}
-	Max   interface{}
-}
-
-// AggregationStrategy represents the strategy for executing aggregations
-type AggregationStrategy struct {
-	CanUseFastPath   bool
-	Reason           string
-	UnsupportedSpecs []AggregationSpec
-}
-
-// TopicDataSources represents the data sources available for a topic
-type TopicDataSources struct {
-	ParquetFiles    map[string][]*ParquetFileStats // partitionPath -> parquet file stats
-	ParquetRowCount int64
-	LiveLogRowCount int64
-	PartitionsCount int
-}
-
-// FastPathOptimizer handles fast path aggregation optimization decisions
-type FastPathOptimizer struct {
-	engine *SQLEngine
-}
-
-// Error types for better error handling and testing
-type AggregationError struct {
-	Operation string
-	Column    string
-	Cause     error
-}
-
-func (e AggregationError) Error() string {
-	return fmt.Sprintf("aggregation error in %s(%s): %v", e.Operation, e.Column, e.Cause)
-}
-
-type DataSourceError struct {
-	Source string
-	Cause  error
-}
-
-func (e DataSourceError) Error() string {
-	return fmt.Sprintf("data source error in %s: %v", e.Source, e.Cause)
-}
-
-type OptimizationError struct {
-	Strategy string
-	Reason   string
-}
-
-func (e OptimizationError) Error() string {
-	return fmt.Sprintf("optimization failed for %s: %s", e.Strategy, e.Reason)
-}
-
-// NewFastPathOptimizer creates a new fast path optimizer
-func NewFastPathOptimizer(engine *SQLEngine) *FastPathOptimizer {
-	return &FastPathOptimizer{engine: engine}
-}
-
-// DetermineStrategy analyzes aggregations and determines if fast path can be used
-func (opt *FastPathOptimizer) DetermineStrategy(aggregations []AggregationSpec) AggregationStrategy {
-	strategy := AggregationStrategy{
-		CanUseFastPath:   true,
-		Reason:           "all_aggregations_supported",
-		UnsupportedSpecs: []AggregationSpec{},
-	}
-
-	for _, spec := range aggregations {
-		if !opt.engine.canUseParquetStatsForAggregation(spec) {
-			strategy.CanUseFastPath = false
-			strategy.Reason = "unsupported_aggregation_functions"
-			strategy.UnsupportedSpecs = append(strategy.UnsupportedSpecs, spec)
-		}
-	}
-
-	return strategy
-}
-
-// CollectDataSources gathers information about available data sources for a topic
-func (opt *FastPathOptimizer) CollectDataSources(ctx context.Context, hybridScanner *HybridMessageScanner) (*TopicDataSources, error) {
-	// Get all partitions for this topic
-	relativePartitions, err := opt.engine.discoverTopicPartitions(hybridScanner.topic.Namespace, hybridScanner.topic.Name)
-	if err != nil {
-		return nil, DataSourceError{
-			Source: fmt.Sprintf("partition_discovery:%s.%s", hybridScanner.topic.Namespace, hybridScanner.topic.Name),
-			Cause:  err,
-		}
-	}
-
-	// Convert relative partition paths to full paths
-	topicBasePath := fmt.Sprintf("/topics/%s/%s", hybridScanner.topic.Namespace, hybridScanner.topic.Name)
-	partitions := make([]string, len(relativePartitions))
-	for i, relPartition := range relativePartitions {
-		partitions[i] = fmt.Sprintf("%s/%s", topicBasePath, relPartition)
-	}
-
-	// Collect statistics from all partitions
-	dataSources := &TopicDataSources{
-		ParquetFiles:    make(map[string][]*ParquetFileStats),
-		ParquetRowCount: 0,
-		LiveLogRowCount: 0,
-		PartitionsCount: len(partitions),
-	}
-
-	for _, partition := range partitions {
-		partitionPath := partition
-
-		// Get parquet file statistics
-		fileStats, err := hybridScanner.ReadParquetStatistics(partitionPath)
-		if err != nil {
-			fileStats = []*ParquetFileStats{} // Empty stats, but continue
-		}
-
-		if len(fileStats) > 0 {
-			dataSources.ParquetFiles[partitionPath] = fileStats
-			for _, fileStat := range fileStats {
-				dataSources.ParquetRowCount += fileStat.RowCount
-			}
-		}
-
-		// Get parquet source files for deduplication
-		parquetSourceFiles := opt.engine.extractParquetSourceFiles(fileStats)
-
-		// Count live log rows (excluding parquet-converted files)
-		liveLogRowCount, err := opt.engine.countLiveLogRowsExcludingParquetSources(partitionPath, parquetSourceFiles)
-		if err != nil {
-			liveLogRowCount = 0 // No live logs is acceptable
-		}
-
-		dataSources.LiveLogRowCount += liveLogRowCount
-	}
-
-	return dataSources, nil
-}
-
 // AggregationComputer handles the computation of aggregations using fast path
 type AggregationComputer struct {
 	engine *SQLEngine
--- a/weed/query/engine/errors.go
+++ b/weed/query/engine/errors.go
@ -0,0 +1,36 @@
+package engine
+
+import "fmt"
+
+// Error types for better error handling and testing
+
+// AggregationError represents errors that occur during aggregation computation
+type AggregationError struct {
+	Operation string
+	Column    string
+	Cause     error
+}
+
+func (e AggregationError) Error() string {
+	return fmt.Sprintf("aggregation error in %s(%s): %v", e.Operation, e.Column, e.Cause)
+}
+
+// DataSourceError represents errors that occur when accessing data sources
+type DataSourceError struct {
+	Source string
+	Cause  error
+}
+
+func (e DataSourceError) Error() string {
+	return fmt.Sprintf("data source error in %s: %v", e.Source, e.Cause)
+}
+
+// OptimizationError represents errors that occur during query optimization
+type OptimizationError struct {
+	Strategy string
+	Reason   string
+}
+
+func (e OptimizationError) Error() string {
+	return fmt.Sprintf("optimization failed for %s: %s", e.Strategy, e.Reason)
+}
--- a/weed/query/engine/system_columns.go
+++ b/weed/query/engine/system_columns.go
@ -0,0 +1,170 @@
+package engine
+
+import (
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
+)
+
+// isSystemColumn checks if a column is a system column (_timestamp_ns, _key, _source)
+func (e *SQLEngine) isSystemColumn(columnName string) bool {
+	lowerName := strings.ToLower(columnName)
+	return lowerName == "_timestamp_ns" || lowerName == "timestamp_ns" ||
+		lowerName == "_key" || lowerName == "key" ||
+		lowerName == "_source" || lowerName == "source"
+}
+
+// isRegularColumn checks if a column might be a regular data column (placeholder)
+func (e *SQLEngine) isRegularColumn(columnName string) bool {
+	// For now, assume any non-system column is a regular column
+	return !e.isSystemColumn(columnName)
+}
+
+// getSystemColumnGlobalMin computes global min for system columns using file metadata
+func (e *SQLEngine) getSystemColumnGlobalMin(columnName string, allFileStats map[string][]*ParquetFileStats) interface{} {
+	lowerName := strings.ToLower(columnName)
+
+	switch lowerName {
+	case "_timestamp_ns", "timestamp_ns":
+		// For timestamps, find the earliest timestamp across all files
+		// This should match what's in the Extended["min"] metadata
+		var minTimestamp *int64
+		for _, fileStats := range allFileStats {
+			for _, fileStat := range fileStats {
+				// Extract timestamp from filename (format: YYYY-MM-DD-HH-MM-SS.parquet)
+				timestamp := e.extractTimestampFromFilename(fileStat.FileName)
+				if timestamp != 0 {
+					if minTimestamp == nil || timestamp < *minTimestamp {
+						minTimestamp = &timestamp
+					}
+				}
+			}
+		}
+		if minTimestamp != nil {
+			return *minTimestamp
+		}
+
+	case "_key", "key":
+		// For keys, we'd need to read the actual parquet column stats
+		// Fall back to scanning if not available in our current stats
+		return nil
+
+	case "_source", "source":
+		// Source is always "parquet_archive" for parquet files
+		return "parquet_archive"
+	}
+
+	return nil
+}
+
+// getSystemColumnGlobalMax computes global max for system columns using file metadata
+func (e *SQLEngine) getSystemColumnGlobalMax(columnName string, allFileStats map[string][]*ParquetFileStats) interface{} {
+	lowerName := strings.ToLower(columnName)
+
+	switch lowerName {
+	case "_timestamp_ns", "timestamp_ns":
+		// For timestamps, find the latest timestamp across all files
+		// This should match what's in the Extended["max"] metadata
+		var maxTimestamp *int64
+		for _, fileStats := range allFileStats {
+			for _, fileStat := range fileStats {
+				// Extract timestamp from filename (format: YYYY-MM-DD-HH-MM-SS.parquet)
+				timestamp := e.extractTimestampFromFilename(fileStat.FileName)
+				if timestamp != 0 {
+					if maxTimestamp == nil || timestamp > *maxTimestamp {
+						maxTimestamp = &timestamp
+					}
+				}
+			}
+		}
+		if maxTimestamp != nil {
+			return *maxTimestamp
+		}
+
+	case "_key", "key":
+		// For keys, we'd need to read the actual parquet column stats
+		// Fall back to scanning if not available in our current stats
+		return nil
+
+	case "_source", "source":
+		// Source is always "parquet_archive" for parquet files
+		return "parquet_archive"
+	}
+
+	return nil
+}
+
+// extractTimestampFromFilename extracts timestamp from parquet filename
+func (e *SQLEngine) extractTimestampFromFilename(filename string) int64 {
+	// Expected format: YYYY-MM-DD-HH-MM-SS.parquet or similar
+	// Try to parse timestamp from filename
+	re := regexp.MustCompile(`(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})`)
+	matches := re.FindStringSubmatch(filename)
+	if len(matches) > 1 {
+		timestampStr := matches[1]
+		// Convert to time and then to nanoseconds
+		t, err := time.Parse("2006-01-02-15-04-05", timestampStr)
+		if err == nil {
+			return t.UnixNano()
+		}
+	}
+
+	// Fallback: try to parse as unix timestamp if filename is numeric
+	if timestampStr := strings.TrimSuffix(filename, ".parquet"); timestampStr != filename {
+		if timestamp, err := strconv.ParseInt(timestampStr, 10, 64); err == nil {
+			// Assume it's already in nanoseconds
+			return timestamp
+		}
+	}
+
+	return 0
+}
+
+// findColumnValue performs case-insensitive lookup of column values
+// Now includes support for system columns stored in HybridScanResult
+func (e *SQLEngine) findColumnValue(result HybridScanResult, columnName string) *schema_pb.Value {
+	lowerName := strings.ToLower(columnName)
+
+	// Check system columns first
+	switch lowerName {
+	case "_timestamp_ns", "timestamp_ns":
+		return &schema_pb.Value{
+			Kind: &schema_pb.Value_Int64Value{Int64Value: result.Timestamp},
+		}
+	case "_key", "key":
+		return &schema_pb.Value{
+			Kind: &schema_pb.Value_BytesValue{BytesValue: result.Key},
+		}
+	case "_source", "source":
+		return &schema_pb.Value{
+			Kind: &schema_pb.Value_StringValue{StringValue: result.Source},
+		}
+	}
+
+	// Check regular columns in the record data
+	if result.RecordValue != nil {
+		recordValue, ok := result.RecordValue.Kind.(*schema_pb.Value_RecordValue)
+		if !ok {
+			return nil
+		}
+
+		if recordValue.RecordValue.Fields != nil {
+			// Try exact match first
+			if value, exists := recordValue.RecordValue.Fields[columnName]; exists {
+				return value
+			}
+
+			// Try case-insensitive match
+			for fieldName, value := range recordValue.RecordValue.Fields {
+				if strings.EqualFold(fieldName, columnName) {
+					return value
+				}
+			}
+		}
+	}
+
+	return nil
+}
--- a/weed/query/engine/types.go
+++ b/weed/query/engine/types.go
@ -0,0 +1,62 @@
+package engine
+
+import (
+	"github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
+	"github.com/seaweedfs/seaweedfs/weed/query/sqltypes"
+)
+
+// QueryExecutionPlan contains information about how a query was executed
+type QueryExecutionPlan struct {
+	QueryType           string
+	ExecutionStrategy   string                 `json:"execution_strategy"` // fast_path, full_scan, hybrid
+	DataSources         []string               `json:"data_sources"`       // parquet_files, live_logs
+	PartitionsScanned   int                    `json:"partitions_scanned"`
+	ParquetFilesScanned int                    `json:"parquet_files_scanned"`
+	LiveLogFilesScanned int                    `json:"live_log_files_scanned"`
+	TotalRowsProcessed  int64                  `json:"total_rows_processed"`
+	OptimizationsUsed   []string               `json:"optimizations_used"` // parquet_stats, predicate_pushdown, etc.
+	TimeRangeFilters    map[string]interface{} `json:"time_range_filters,omitempty"`
+	Aggregations        []string               `json:"aggregations,omitempty"`
+	ExecutionTimeMs     float64                `json:"execution_time_ms"`
+	Details             map[string]interface{} `json:"details,omitempty"`
+}
+
+// QueryResult represents the result of a SQL query execution
+type QueryResult struct {
+	Columns       []string            `json:"columns"`
+	Rows          [][]sqltypes.Value  `json:"rows"`
+	Error         error               `json:"error,omitempty"`
+	ExecutionPlan *QueryExecutionPlan `json:"execution_plan,omitempty"`
+}
+
+// ParquetColumnStats holds statistics for a single column in a Parquet file
+type ParquetColumnStats struct {
+	ColumnName string
+	MinValue   *schema_pb.Value
+	MaxValue   *schema_pb.Value
+	NullCount  int64
+	RowCount   int64
+}
+
+// ParquetFileStats holds statistics for a single Parquet file
+type ParquetFileStats struct {
+	FileName    string
+	RowCount    int64
+	ColumnStats map[string]*ParquetColumnStats
+}
+
+// HybridScanResult represents a single record from hybrid scanning
+type HybridScanResult struct {
+	RecordValue *schema_pb.Value
+	Source      string // "live_log", "parquet_archive"
+	Timestamp   int64
+	Key         []byte
+}
+
+// HybridScanOptions configures how the hybrid scanner operates
+type HybridScanOptions struct {
+	StartTimeNs int64
+	StopTimeNs  int64
+	Limit       int
+	Predicate   func(*schema_pb.RecordValue) bool
+}