package engine import ( "context" "encoding/binary" "errors" "testing" "github.com/seaweedfs/seaweedfs/weed/mq/topic" "github.com/seaweedfs/seaweedfs/weed/pb/filer_pb" "github.com/seaweedfs/seaweedfs/weed/pb/schema_pb" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "google.golang.org/protobuf/proto" ) // Mock implementations for testing type MockHybridMessageScanner struct { mock.Mock topic topic.Topic } func (m *MockHybridMessageScanner) ReadParquetStatistics(partitionPath string) ([]*ParquetFileStats, error) { args := m.Called(partitionPath) return args.Get(0).([]*ParquetFileStats), args.Error(1) } type MockSQLEngine struct { *SQLEngine mockPartitions map[string][]string mockParquetSourceFiles map[string]map[string]bool mockLiveLogRowCounts map[string]int64 mockColumnStats map[string]map[string]*ParquetColumnStats } func NewMockSQLEngine() *MockSQLEngine { return &MockSQLEngine{ SQLEngine: &SQLEngine{ catalog: &SchemaCatalog{ databases: make(map[string]*DatabaseInfo), currentDatabase: "test", }, }, mockPartitions: make(map[string][]string), mockParquetSourceFiles: make(map[string]map[string]bool), mockLiveLogRowCounts: make(map[string]int64), mockColumnStats: make(map[string]map[string]*ParquetColumnStats), } } func (m *MockSQLEngine) discoverTopicPartitions(namespace, topicName string) ([]string, error) { key := namespace + "." + topicName if partitions, exists := m.mockPartitions[key]; exists { return partitions, nil } return []string{"partition-1", "partition-2"}, nil } func (m *MockSQLEngine) extractParquetSourceFiles(fileStats []*ParquetFileStats) map[string]bool { if len(fileStats) == 0 { return make(map[string]bool) } return map[string]bool{"converted-log-1": true} } func (m *MockSQLEngine) countLiveLogRowsExcludingParquetSources(ctx context.Context, partition string, parquetSources map[string]bool) (int64, error) { if count, exists := m.mockLiveLogRowCounts[partition]; exists { return count, nil } return 25, nil } func (m *MockSQLEngine) computeLiveLogMinMax(partition, column string, parquetSources map[string]bool) (interface{}, interface{}, error) { switch column { case "id": return int64(1), int64(50), nil case "value": return 10.5, 99.9, nil default: return nil, nil, nil } } func (m *MockSQLEngine) getSystemColumnGlobalMin(column string, allFileStats map[string][]*ParquetFileStats) interface{} { return int64(1000000000) } func (m *MockSQLEngine) getSystemColumnGlobalMax(column string, allFileStats map[string][]*ParquetFileStats) interface{} { return int64(2000000000) } func createMockColumnStats(column string, minVal, maxVal interface{}) *ParquetColumnStats { return &ParquetColumnStats{ ColumnName: column, MinValue: convertToSchemaValue(minVal), MaxValue: convertToSchemaValue(maxVal), NullCount: 0, } } func convertToSchemaValue(val interface{}) *schema_pb.Value { switch v := val.(type) { case int64: return &schema_pb.Value{Kind: &schema_pb.Value_Int64Value{Int64Value: v}} case float64: return &schema_pb.Value{Kind: &schema_pb.Value_DoubleValue{DoubleValue: v}} case string: return &schema_pb.Value{Kind: &schema_pb.Value_StringValue{StringValue: v}} } return nil } // Test FastPathOptimizer func TestFastPathOptimizer_DetermineStrategy(t *testing.T) { engine := NewMockSQLEngine() optimizer := NewFastPathOptimizer(engine.SQLEngine) tests := []struct { name string aggregations []AggregationSpec expected AggregationStrategy }{ { name: "Supported aggregations", aggregations: []AggregationSpec{ {Function: FuncCOUNT, Column: "*"}, {Function: FuncMAX, Column: "id"}, {Function: FuncMIN, Column: "value"}, }, expected: AggregationStrategy{ CanUseFastPath: true, Reason: "all_aggregations_supported", UnsupportedSpecs: []AggregationSpec{}, }, }, { name: "Unsupported aggregation", aggregations: []AggregationSpec{ {Function: FuncCOUNT, Column: "*"}, {Function: FuncAVG, Column: "value"}, // Not supported }, expected: AggregationStrategy{ CanUseFastPath: false, Reason: "unsupported_aggregation_functions", }, }, { name: "Empty aggregations", aggregations: []AggregationSpec{}, expected: AggregationStrategy{ CanUseFastPath: true, Reason: "all_aggregations_supported", UnsupportedSpecs: []AggregationSpec{}, }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { strategy := optimizer.DetermineStrategy(tt.aggregations) assert.Equal(t, tt.expected.CanUseFastPath, strategy.CanUseFastPath) assert.Equal(t, tt.expected.Reason, strategy.Reason) if !tt.expected.CanUseFastPath { assert.NotEmpty(t, strategy.UnsupportedSpecs) } }) } } // Test AggregationComputer func TestAggregationComputer_ComputeFastPathAggregations(t *testing.T) { engine := NewMockSQLEngine() computer := NewAggregationComputer(engine.SQLEngine) dataSources := &TopicDataSources{ ParquetFiles: map[string][]*ParquetFileStats{ "/topics/test/topic1/partition-1": { { RowCount: 30, ColumnStats: map[string]*ParquetColumnStats{ "id": createMockColumnStats("id", int64(10), int64(40)), }, }, }, }, ParquetRowCount: 30, LiveLogRowCount: 25, PartitionsCount: 1, } partitions := []string{"/topics/test/topic1/partition-1"} tests := []struct { name string aggregations []AggregationSpec validate func(t *testing.T, results []AggregationResult) }{ { name: "COUNT aggregation", aggregations: []AggregationSpec{ {Function: FuncCOUNT, Column: "*"}, }, validate: func(t *testing.T, results []AggregationResult) { assert.Len(t, results, 1) assert.Equal(t, int64(55), results[0].Count) // 30 + 25 }, }, { name: "MAX aggregation", aggregations: []AggregationSpec{ {Function: FuncMAX, Column: "id"}, }, validate: func(t *testing.T, results []AggregationResult) { assert.Len(t, results, 1) // Should be max of parquet stats (40) - mock doesn't combine with live log assert.Equal(t, int64(40), results[0].Max) }, }, { name: "MIN aggregation", aggregations: []AggregationSpec{ {Function: FuncMIN, Column: "id"}, }, validate: func(t *testing.T, results []AggregationResult) { assert.Len(t, results, 1) // Should be min of parquet stats (10) - mock doesn't combine with live log assert.Equal(t, int64(10), results[0].Min) }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { ctx := context.Background() results, err := computer.ComputeFastPathAggregations(ctx, tt.aggregations, dataSources, partitions) assert.NoError(t, err) tt.validate(t, results) }) } } // Test case-insensitive column lookup and null handling for MIN/MAX aggregations func TestAggregationComputer_MinMaxEdgeCases(t *testing.T) { engine := NewMockSQLEngine() computer := NewAggregationComputer(engine.SQLEngine) tests := []struct { name string dataSources *TopicDataSources aggregations []AggregationSpec validate func(t *testing.T, results []AggregationResult, err error) }{ { name: "Case insensitive column lookup", dataSources: &TopicDataSources{ ParquetFiles: map[string][]*ParquetFileStats{ "/topics/test/partition-1": { { RowCount: 50, ColumnStats: map[string]*ParquetColumnStats{ "ID": createMockColumnStats("ID", int64(5), int64(95)), // Uppercase column name }, }, }, }, ParquetRowCount: 50, LiveLogRowCount: 0, PartitionsCount: 1, }, aggregations: []AggregationSpec{ {Function: FuncMIN, Column: "id"}, // lowercase column name {Function: FuncMAX, Column: "id"}, }, validate: func(t *testing.T, results []AggregationResult, err error) { assert.NoError(t, err) assert.Len(t, results, 2) assert.Equal(t, int64(5), results[0].Min, "MIN should work with case-insensitive lookup") assert.Equal(t, int64(95), results[1].Max, "MAX should work with case-insensitive lookup") }, }, { name: "Null column stats handling", dataSources: &TopicDataSources{ ParquetFiles: map[string][]*ParquetFileStats{ "/topics/test/partition-1": { { RowCount: 50, ColumnStats: map[string]*ParquetColumnStats{ "id": { ColumnName: "id", MinValue: nil, // Null min value MaxValue: nil, // Null max value NullCount: 50, RowCount: 50, }, }, }, }, }, ParquetRowCount: 50, LiveLogRowCount: 0, PartitionsCount: 1, }, aggregations: []AggregationSpec{ {Function: FuncMIN, Column: "id"}, {Function: FuncMAX, Column: "id"}, }, validate: func(t *testing.T, results []AggregationResult, err error) { assert.NoError(t, err) assert.Len(t, results, 2) // When stats are null, should fall back to system column or return nil // This tests that we don't crash on null stats }, }, { name: "Mixed data types - string column", dataSources: &TopicDataSources{ ParquetFiles: map[string][]*ParquetFileStats{ "/topics/test/partition-1": { { RowCount: 30, ColumnStats: map[string]*ParquetColumnStats{ "name": createMockColumnStats("name", "Alice", "Zoe"), }, }, }, }, ParquetRowCount: 30, LiveLogRowCount: 0, PartitionsCount: 1, }, aggregations: []AggregationSpec{ {Function: FuncMIN, Column: "name"}, {Function: FuncMAX, Column: "name"}, }, validate: func(t *testing.T, results []AggregationResult, err error) { assert.NoError(t, err) assert.Len(t, results, 2) assert.Equal(t, "Alice", results[0].Min) assert.Equal(t, "Zoe", results[1].Max) }, }, { name: "Mixed data types - float column", dataSources: &TopicDataSources{ ParquetFiles: map[string][]*ParquetFileStats{ "/topics/test/partition-1": { { RowCount: 25, ColumnStats: map[string]*ParquetColumnStats{ "price": createMockColumnStats("price", float64(19.99), float64(299.50)), }, }, }, }, ParquetRowCount: 25, LiveLogRowCount: 0, PartitionsCount: 1, }, aggregations: []AggregationSpec{ {Function: FuncMIN, Column: "price"}, {Function: FuncMAX, Column: "price"}, }, validate: func(t *testing.T, results []AggregationResult, err error) { assert.NoError(t, err) assert.Len(t, results, 2) assert.Equal(t, float64(19.99), results[0].Min) assert.Equal(t, float64(299.50), results[1].Max) }, }, { name: "Column not found in parquet stats", dataSources: &TopicDataSources{ ParquetFiles: map[string][]*ParquetFileStats{ "/topics/test/partition-1": { { RowCount: 20, ColumnStats: map[string]*ParquetColumnStats{ "id": createMockColumnStats("id", int64(1), int64(100)), // Note: "nonexistent_column" is not in stats }, }, }, }, ParquetRowCount: 20, LiveLogRowCount: 10, // Has live logs to fall back to PartitionsCount: 1, }, aggregations: []AggregationSpec{ {Function: FuncMIN, Column: "nonexistent_column"}, {Function: FuncMAX, Column: "nonexistent_column"}, }, validate: func(t *testing.T, results []AggregationResult, err error) { assert.NoError(t, err) assert.Len(t, results, 2) // Should fall back to live log processing or return nil // The key is that it shouldn't crash }, }, { name: "Multiple parquet files with different ranges", dataSources: &TopicDataSources{ ParquetFiles: map[string][]*ParquetFileStats{ "/topics/test/partition-1": { { RowCount: 30, ColumnStats: map[string]*ParquetColumnStats{ "score": createMockColumnStats("score", int64(10), int64(50)), }, }, { RowCount: 40, ColumnStats: map[string]*ParquetColumnStats{ "score": createMockColumnStats("score", int64(5), int64(75)), // Lower min, higher max }, }, }, }, ParquetRowCount: 70, LiveLogRowCount: 0, PartitionsCount: 1, }, aggregations: []AggregationSpec{ {Function: FuncMIN, Column: "score"}, {Function: FuncMAX, Column: "score"}, }, validate: func(t *testing.T, results []AggregationResult, err error) { assert.NoError(t, err) assert.Len(t, results, 2) assert.Equal(t, int64(5), results[0].Min, "Should find global minimum across all files") assert.Equal(t, int64(75), results[1].Max, "Should find global maximum across all files") }, }, } partitions := []string{"/topics/test/partition-1"} for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { ctx := context.Background() results, err := computer.ComputeFastPathAggregations(ctx, tt.aggregations, tt.dataSources, partitions) tt.validate(t, results, err) }) } } // Test the specific bug where MIN/MAX was returning empty values func TestAggregationComputer_MinMaxEmptyValuesBugFix(t *testing.T) { engine := NewMockSQLEngine() computer := NewAggregationComputer(engine.SQLEngine) // This test specifically addresses the bug where MIN/MAX returned empty // due to improper null checking and extraction logic dataSources := &TopicDataSources{ ParquetFiles: map[string][]*ParquetFileStats{ "/topics/test/test-topic/partition1": { { RowCount: 100, ColumnStats: map[string]*ParquetColumnStats{ "id": { ColumnName: "id", MinValue: &schema_pb.Value{Kind: &schema_pb.Value_Int32Value{Int32Value: 0}}, // Min should be 0 MaxValue: &schema_pb.Value{Kind: &schema_pb.Value_Int32Value{Int32Value: 99}}, // Max should be 99 NullCount: 0, RowCount: 100, }, }, }, }, }, ParquetRowCount: 100, LiveLogRowCount: 0, // No live logs, pure parquet stats PartitionsCount: 1, } partitions := []string{"/topics/test/test-topic/partition1"} tests := []struct { name string aggregSpec AggregationSpec expected interface{} }{ { name: "MIN should return 0 not empty", aggregSpec: AggregationSpec{Function: FuncMIN, Column: "id"}, expected: int32(0), // Should extract the actual minimum value }, { name: "MAX should return 99 not empty", aggregSpec: AggregationSpec{Function: FuncMAX, Column: "id"}, expected: int32(99), // Should extract the actual maximum value }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { ctx := context.Background() results, err := computer.ComputeFastPathAggregations(ctx, []AggregationSpec{tt.aggregSpec}, dataSources, partitions) assert.NoError(t, err) assert.Len(t, results, 1) // Verify the result is not nil/empty if tt.aggregSpec.Function == FuncMIN { assert.NotNil(t, results[0].Min, "MIN result should not be nil") assert.Equal(t, tt.expected, results[0].Min) } else if tt.aggregSpec.Function == FuncMAX { assert.NotNil(t, results[0].Max, "MAX result should not be nil") assert.Equal(t, tt.expected, results[0].Max) } }) } } // Test the formatAggregationResult function with MIN/MAX edge cases func TestSQLEngine_FormatAggregationResult_MinMax(t *testing.T) { engine := NewTestSQLEngine() tests := []struct { name string spec AggregationSpec result AggregationResult expected string }{ { name: "MIN with zero value should not be empty", spec: AggregationSpec{Function: FuncMIN, Column: "id"}, result: AggregationResult{Min: int32(0)}, expected: "0", }, { name: "MAX with large value", spec: AggregationSpec{Function: FuncMAX, Column: "id"}, result: AggregationResult{Max: int32(99)}, expected: "99", }, { name: "MIN with negative value", spec: AggregationSpec{Function: FuncMIN, Column: "score"}, result: AggregationResult{Min: int64(-50)}, expected: "-50", }, { name: "MAX with float value", spec: AggregationSpec{Function: FuncMAX, Column: "price"}, result: AggregationResult{Max: float64(299.99)}, expected: "299.99", }, { name: "MIN with string value", spec: AggregationSpec{Function: FuncMIN, Column: "name"}, result: AggregationResult{Min: "Alice"}, expected: "Alice", }, { name: "MIN with nil should return NULL", spec: AggregationSpec{Function: FuncMIN, Column: "missing"}, result: AggregationResult{Min: nil}, expected: "", // NULL values display as empty }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { sqlValue := engine.formatAggregationResult(tt.spec, tt.result) assert.Equal(t, tt.expected, sqlValue.String()) }) } } // Test the direct formatAggregationResult scenario that was originally broken func TestSQLEngine_MinMaxBugFixIntegration(t *testing.T) { // This test focuses on the core bug fix without the complexity of table discovery // It directly tests the scenario where MIN/MAX returned empty due to the bug engine := NewTestSQLEngine() // Test the direct formatting path that was failing tests := []struct { name string aggregSpec AggregationSpec aggResult AggregationResult expectedEmpty bool expectedValue string }{ { name: "MIN with zero should not be empty (the original bug)", aggregSpec: AggregationSpec{Function: FuncMIN, Column: "id", Alias: "MIN(id)"}, aggResult: AggregationResult{Min: int32(0)}, // This was returning empty before fix expectedEmpty: false, expectedValue: "0", }, { name: "MAX with valid value should not be empty", aggregSpec: AggregationSpec{Function: FuncMAX, Column: "id", Alias: "MAX(id)"}, aggResult: AggregationResult{Max: int32(99)}, expectedEmpty: false, expectedValue: "99", }, { name: "MIN with negative value should work", aggregSpec: AggregationSpec{Function: FuncMIN, Column: "score", Alias: "MIN(score)"}, aggResult: AggregationResult{Min: int64(-10)}, expectedEmpty: false, expectedValue: "-10", }, { name: "MIN with nil should be empty (expected behavior)", aggregSpec: AggregationSpec{Function: FuncMIN, Column: "missing", Alias: "MIN(missing)"}, aggResult: AggregationResult{Min: nil}, expectedEmpty: true, expectedValue: "", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { // Test the formatAggregationResult function directly sqlValue := engine.formatAggregationResult(tt.aggregSpec, tt.aggResult) result := sqlValue.String() if tt.expectedEmpty { assert.Empty(t, result, "Result should be empty for nil values") } else { assert.NotEmpty(t, result, "Result should not be empty") assert.Equal(t, tt.expectedValue, result) } }) } } // Test the tryFastParquetAggregation method specifically for the bug func TestSQLEngine_FastParquetAggregationBugFix(t *testing.T) { // This test verifies that the fast path aggregation logic works correctly // and doesn't return nil/empty values when it should return actual data engine := NewMockSQLEngine() computer := NewAggregationComputer(engine.SQLEngine) // Create realistic data sources that mimic the user's scenario dataSources := &TopicDataSources{ ParquetFiles: map[string][]*ParquetFileStats{ "/topics/test/test-topic/v2025-09-01-22-54-02/0000-0630": { { RowCount: 100, ColumnStats: map[string]*ParquetColumnStats{ "id": { ColumnName: "id", MinValue: &schema_pb.Value{Kind: &schema_pb.Value_Int32Value{Int32Value: 0}}, MaxValue: &schema_pb.Value{Kind: &schema_pb.Value_Int32Value{Int32Value: 99}}, NullCount: 0, RowCount: 100, }, }, }, }, }, ParquetRowCount: 100, LiveLogRowCount: 0, // Pure parquet scenario PartitionsCount: 1, } partitions := []string{"/topics/test/test-topic/v2025-09-01-22-54-02/0000-0630"} tests := []struct { name string aggregations []AggregationSpec validateResults func(t *testing.T, results []AggregationResult) }{ { name: "Single MIN aggregation should return value not nil", aggregations: []AggregationSpec{ {Function: FuncMIN, Column: "id", Alias: "MIN(id)"}, }, validateResults: func(t *testing.T, results []AggregationResult) { assert.Len(t, results, 1) assert.NotNil(t, results[0].Min, "MIN result should not be nil") assert.Equal(t, int32(0), results[0].Min, "MIN should return the correct minimum value") }, }, { name: "Single MAX aggregation should return value not nil", aggregations: []AggregationSpec{ {Function: FuncMAX, Column: "id", Alias: "MAX(id)"}, }, validateResults: func(t *testing.T, results []AggregationResult) { assert.Len(t, results, 1) assert.NotNil(t, results[0].Max, "MAX result should not be nil") assert.Equal(t, int32(99), results[0].Max, "MAX should return the correct maximum value") }, }, { name: "Combined MIN/MAX should both return values", aggregations: []AggregationSpec{ {Function: FuncMIN, Column: "id", Alias: "MIN(id)"}, {Function: FuncMAX, Column: "id", Alias: "MAX(id)"}, }, validateResults: func(t *testing.T, results []AggregationResult) { assert.Len(t, results, 2) assert.NotNil(t, results[0].Min, "MIN result should not be nil") assert.NotNil(t, results[1].Max, "MAX result should not be nil") assert.Equal(t, int32(0), results[0].Min) assert.Equal(t, int32(99), results[1].Max) }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { ctx := context.Background() results, err := computer.ComputeFastPathAggregations(ctx, tt.aggregations, dataSources, partitions) assert.NoError(t, err, "ComputeFastPathAggregations should not error") tt.validateResults(t, results) }) } } // Test ExecutionPlanBuilder func TestExecutionPlanBuilder_BuildAggregationPlan(t *testing.T) { engine := NewMockSQLEngine() builder := NewExecutionPlanBuilder(engine.SQLEngine) // Parse a simple SELECT statement using the native parser stmt, err := ParseSQL("SELECT COUNT(*) FROM test_topic") assert.NoError(t, err) selectStmt := stmt.(*SelectStatement) aggregations := []AggregationSpec{ {Function: FuncCOUNT, Column: "*"}, } strategy := AggregationStrategy{ CanUseFastPath: true, Reason: "all_aggregations_supported", } dataSources := &TopicDataSources{ ParquetRowCount: 100, LiveLogRowCount: 50, PartitionsCount: 3, ParquetFiles: map[string][]*ParquetFileStats{ "partition-1": {{RowCount: 50}}, "partition-2": {{RowCount: 50}}, }, } plan := builder.BuildAggregationPlan(selectStmt, aggregations, strategy, dataSources) assert.Equal(t, "SELECT", plan.QueryType) assert.Equal(t, "hybrid_fast_path", plan.ExecutionStrategy) assert.Contains(t, plan.DataSources, "parquet_stats") assert.Contains(t, plan.DataSources, "live_logs") assert.Equal(t, 3, plan.PartitionsScanned) assert.Equal(t, 2, plan.ParquetFilesScanned) assert.Contains(t, plan.OptimizationsUsed, "parquet_statistics") assert.Equal(t, []string{"COUNT(*)"}, plan.Aggregations) assert.Equal(t, int64(50), plan.TotalRowsProcessed) // Only live logs scanned } // Test Error Types func TestErrorTypes(t *testing.T) { t.Run("AggregationError", func(t *testing.T) { err := AggregationError{ Operation: "MAX", Column: "id", Cause: errors.New("column not found"), } expected := "aggregation error in MAX(id): column not found" assert.Equal(t, expected, err.Error()) }) t.Run("DataSourceError", func(t *testing.T) { err := DataSourceError{ Source: "partition_discovery:test.topic1", Cause: errors.New("network timeout"), } expected := "data source error in partition_discovery:test.topic1: network timeout" assert.Equal(t, expected, err.Error()) }) t.Run("OptimizationError", func(t *testing.T) { err := OptimizationError{ Strategy: "fast_path_aggregation", Reason: "unsupported function: AVG", } expected := "optimization failed for fast_path_aggregation: unsupported function: AVG" assert.Equal(t, expected, err.Error()) }) } // Integration Tests func TestIntegration_FastPathOptimization(t *testing.T) { engine := NewMockSQLEngine() // Setup components optimizer := NewFastPathOptimizer(engine.SQLEngine) computer := NewAggregationComputer(engine.SQLEngine) // Mock data setup aggregations := []AggregationSpec{ {Function: FuncCOUNT, Column: "*"}, {Function: FuncMAX, Column: "id"}, } // Step 1: Determine strategy strategy := optimizer.DetermineStrategy(aggregations) assert.True(t, strategy.CanUseFastPath) // Step 2: Mock data sources dataSources := &TopicDataSources{ ParquetFiles: map[string][]*ParquetFileStats{ "/topics/test/topic1/partition-1": {{ RowCount: 75, ColumnStats: map[string]*ParquetColumnStats{ "id": createMockColumnStats("id", int64(1), int64(100)), }, }}, }, ParquetRowCount: 75, LiveLogRowCount: 25, PartitionsCount: 1, } partitions := []string{"/topics/test/topic1/partition-1"} // Step 3: Compute aggregations ctx := context.Background() results, err := computer.ComputeFastPathAggregations(ctx, aggregations, dataSources, partitions) assert.NoError(t, err) assert.Len(t, results, 2) assert.Equal(t, int64(100), results[0].Count) // 75 + 25 assert.Equal(t, int64(100), results[1].Max) // From parquet stats mock } func TestIntegration_FallbackToFullScan(t *testing.T) { engine := NewMockSQLEngine() optimizer := NewFastPathOptimizer(engine.SQLEngine) // Unsupported aggregations aggregations := []AggregationSpec{ {Function: "AVG", Column: "value"}, // Not supported } // Step 1: Strategy should reject fast path strategy := optimizer.DetermineStrategy(aggregations) assert.False(t, strategy.CanUseFastPath) assert.Equal(t, "unsupported_aggregation_functions", strategy.Reason) assert.NotEmpty(t, strategy.UnsupportedSpecs) } // Benchmark Tests func BenchmarkFastPathOptimizer_DetermineStrategy(b *testing.B) { engine := NewMockSQLEngine() optimizer := NewFastPathOptimizer(engine.SQLEngine) aggregations := []AggregationSpec{ {Function: FuncCOUNT, Column: "*"}, {Function: FuncMAX, Column: "id"}, {Function: "MIN", Column: "value"}, } b.ResetTimer() for i := 0; i < b.N; i++ { strategy := optimizer.DetermineStrategy(aggregations) _ = strategy.CanUseFastPath } } func BenchmarkAggregationComputer_ComputeFastPathAggregations(b *testing.B) { engine := NewMockSQLEngine() computer := NewAggregationComputer(engine.SQLEngine) dataSources := &TopicDataSources{ ParquetFiles: map[string][]*ParquetFileStats{ "partition-1": {{ RowCount: 1000, ColumnStats: map[string]*ParquetColumnStats{ "id": createMockColumnStats("id", int64(1), int64(1000)), }, }}, }, ParquetRowCount: 1000, LiveLogRowCount: 100, } aggregations := []AggregationSpec{ {Function: FuncCOUNT, Column: "*"}, {Function: FuncMAX, Column: "id"}, } partitions := []string{"partition-1"} ctx := context.Background() b.ResetTimer() for i := 0; i < b.N; i++ { results, err := computer.ComputeFastPathAggregations(ctx, aggregations, dataSources, partitions) if err != nil { b.Fatal(err) } _ = results } } // Tests for convertLogEntryToRecordValue - Protocol Buffer parsing bug fix func TestSQLEngine_ConvertLogEntryToRecordValue_ValidProtobuf(t *testing.T) { engine := NewTestSQLEngine() // Create a valid RecordValue protobuf with user data originalRecord := &schema_pb.RecordValue{ Fields: map[string]*schema_pb.Value{ "id": {Kind: &schema_pb.Value_Int32Value{Int32Value: 42}}, "name": {Kind: &schema_pb.Value_StringValue{StringValue: "test-user"}}, "score": {Kind: &schema_pb.Value_DoubleValue{DoubleValue: 95.5}}, }, } // Serialize the protobuf (this is what MQ actually stores) protobufData, err := proto.Marshal(originalRecord) assert.NoError(t, err) // Create a LogEntry with the serialized data logEntry := &filer_pb.LogEntry{ TsNs: 1609459200000000000, // 2021-01-01 00:00:00 UTC PartitionKeyHash: 123, Data: protobufData, // Protocol buffer data (not JSON!) Key: []byte("test-key-001"), } // Test the conversion result, source, err := engine.convertLogEntryToRecordValue(logEntry) // Verify no error assert.NoError(t, err) assert.Equal(t, "live_log", source) assert.NotNil(t, result) assert.NotNil(t, result.Fields) // Verify system columns are added correctly assert.Contains(t, result.Fields, SW_COLUMN_NAME_TIMESTAMP) assert.Contains(t, result.Fields, SW_COLUMN_NAME_KEY) assert.Equal(t, int64(1609459200000000000), result.Fields[SW_COLUMN_NAME_TIMESTAMP].GetInt64Value()) assert.Equal(t, []byte("test-key-001"), result.Fields[SW_COLUMN_NAME_KEY].GetBytesValue()) // Verify user data is preserved assert.Contains(t, result.Fields, "id") assert.Contains(t, result.Fields, "name") assert.Contains(t, result.Fields, "score") assert.Equal(t, int32(42), result.Fields["id"].GetInt32Value()) assert.Equal(t, "test-user", result.Fields["name"].GetStringValue()) assert.Equal(t, 95.5, result.Fields["score"].GetDoubleValue()) } func TestSQLEngine_ConvertLogEntryToRecordValue_InvalidProtobuf(t *testing.T) { engine := NewTestSQLEngine() // Create LogEntry with invalid protobuf data (this would cause the original JSON parsing bug) logEntry := &filer_pb.LogEntry{ TsNs: 1609459200000000000, PartitionKeyHash: 123, Data: []byte{0x17, 0x00, 0xFF, 0xFE}, // Invalid protobuf data (starts with \x17 like in the original error) Key: []byte("test-key"), } // Test the conversion result, source, err := engine.convertLogEntryToRecordValue(logEntry) // Should return error for invalid protobuf assert.Error(t, err) assert.Contains(t, err.Error(), "failed to unmarshal log entry protobuf") assert.Nil(t, result) assert.Empty(t, source) } func TestSQLEngine_ConvertLogEntryToRecordValue_EmptyProtobuf(t *testing.T) { engine := NewTestSQLEngine() // Create a minimal valid RecordValue (empty fields) emptyRecord := &schema_pb.RecordValue{ Fields: map[string]*schema_pb.Value{}, } protobufData, err := proto.Marshal(emptyRecord) assert.NoError(t, err) logEntry := &filer_pb.LogEntry{ TsNs: 1609459200000000000, PartitionKeyHash: 456, Data: protobufData, Key: []byte("empty-key"), } // Test the conversion result, source, err := engine.convertLogEntryToRecordValue(logEntry) // Should succeed and add system columns assert.NoError(t, err) assert.Equal(t, "live_log", source) assert.NotNil(t, result) assert.NotNil(t, result.Fields) // Should have system columns assert.Contains(t, result.Fields, SW_COLUMN_NAME_TIMESTAMP) assert.Contains(t, result.Fields, SW_COLUMN_NAME_KEY) assert.Equal(t, int64(1609459200000000000), result.Fields[SW_COLUMN_NAME_TIMESTAMP].GetInt64Value()) assert.Equal(t, []byte("empty-key"), result.Fields[SW_COLUMN_NAME_KEY].GetBytesValue()) // Should have no user fields userFieldCount := 0 for fieldName := range result.Fields { if fieldName != SW_COLUMN_NAME_TIMESTAMP && fieldName != SW_COLUMN_NAME_KEY { userFieldCount++ } } assert.Equal(t, 0, userFieldCount) } func TestSQLEngine_ConvertLogEntryToRecordValue_NilFieldsMap(t *testing.T) { engine := NewTestSQLEngine() // Create RecordValue with nil Fields map (edge case) recordWithNilFields := &schema_pb.RecordValue{ Fields: nil, // This should be handled gracefully } protobufData, err := proto.Marshal(recordWithNilFields) assert.NoError(t, err) logEntry := &filer_pb.LogEntry{ TsNs: 1609459200000000000, PartitionKeyHash: 789, Data: protobufData, Key: []byte("nil-fields-key"), } // Test the conversion result, source, err := engine.convertLogEntryToRecordValue(logEntry) // Should succeed and create Fields map assert.NoError(t, err) assert.Equal(t, "live_log", source) assert.NotNil(t, result) assert.NotNil(t, result.Fields) // Should be created by the function // Should have system columns assert.Contains(t, result.Fields, SW_COLUMN_NAME_TIMESTAMP) assert.Contains(t, result.Fields, SW_COLUMN_NAME_KEY) assert.Equal(t, int64(1609459200000000000), result.Fields[SW_COLUMN_NAME_TIMESTAMP].GetInt64Value()) assert.Equal(t, []byte("nil-fields-key"), result.Fields[SW_COLUMN_NAME_KEY].GetBytesValue()) } func TestSQLEngine_ConvertLogEntryToRecordValue_SystemColumnOverride(t *testing.T) { engine := NewTestSQLEngine() // Create RecordValue that already has system column names (should be overridden) recordWithSystemCols := &schema_pb.RecordValue{ Fields: map[string]*schema_pb.Value{ "user_field": {Kind: &schema_pb.Value_StringValue{StringValue: "user-data"}}, SW_COLUMN_NAME_TIMESTAMP: {Kind: &schema_pb.Value_Int64Value{Int64Value: 999999999}}, // Should be overridden SW_COLUMN_NAME_KEY: {Kind: &schema_pb.Value_StringValue{StringValue: "old-key"}}, // Should be overridden }, } protobufData, err := proto.Marshal(recordWithSystemCols) assert.NoError(t, err) logEntry := &filer_pb.LogEntry{ TsNs: 1609459200000000000, PartitionKeyHash: 100, Data: protobufData, Key: []byte("actual-key"), } // Test the conversion result, source, err := engine.convertLogEntryToRecordValue(logEntry) // Should succeed assert.NoError(t, err) assert.Equal(t, "live_log", source) assert.NotNil(t, result) // System columns should use LogEntry values, not protobuf values assert.Equal(t, int64(1609459200000000000), result.Fields[SW_COLUMN_NAME_TIMESTAMP].GetInt64Value()) assert.Equal(t, []byte("actual-key"), result.Fields[SW_COLUMN_NAME_KEY].GetBytesValue()) // User field should be preserved assert.Contains(t, result.Fields, "user_field") assert.Equal(t, "user-data", result.Fields["user_field"].GetStringValue()) } func TestSQLEngine_ConvertLogEntryToRecordValue_ComplexDataTypes(t *testing.T) { engine := NewTestSQLEngine() // Test with various data types complexRecord := &schema_pb.RecordValue{ Fields: map[string]*schema_pb.Value{ "int32_field": {Kind: &schema_pb.Value_Int32Value{Int32Value: -42}}, "int64_field": {Kind: &schema_pb.Value_Int64Value{Int64Value: 9223372036854775807}}, "float_field": {Kind: &schema_pb.Value_FloatValue{FloatValue: 3.14159}}, "double_field": {Kind: &schema_pb.Value_DoubleValue{DoubleValue: 2.718281828}}, "bool_field": {Kind: &schema_pb.Value_BoolValue{BoolValue: true}}, "string_field": {Kind: &schema_pb.Value_StringValue{StringValue: "test string with unicode 🎉"}}, "bytes_field": {Kind: &schema_pb.Value_BytesValue{BytesValue: []byte{0x01, 0x02, 0x03}}}, }, } protobufData, err := proto.Marshal(complexRecord) assert.NoError(t, err) logEntry := &filer_pb.LogEntry{ TsNs: 1609459200000000000, PartitionKeyHash: 200, Data: protobufData, Key: []byte("complex-key"), } // Test the conversion result, source, err := engine.convertLogEntryToRecordValue(logEntry) // Should succeed assert.NoError(t, err) assert.Equal(t, "live_log", source) assert.NotNil(t, result) // Verify all data types are preserved assert.Equal(t, int32(-42), result.Fields["int32_field"].GetInt32Value()) assert.Equal(t, int64(9223372036854775807), result.Fields["int64_field"].GetInt64Value()) assert.Equal(t, float32(3.14159), result.Fields["float_field"].GetFloatValue()) assert.Equal(t, 2.718281828, result.Fields["double_field"].GetDoubleValue()) assert.Equal(t, true, result.Fields["bool_field"].GetBoolValue()) assert.Equal(t, "test string with unicode 🎉", result.Fields["string_field"].GetStringValue()) assert.Equal(t, []byte{0x01, 0x02, 0x03}, result.Fields["bytes_field"].GetBytesValue()) // System columns should still be present assert.Contains(t, result.Fields, SW_COLUMN_NAME_TIMESTAMP) assert.Contains(t, result.Fields, SW_COLUMN_NAME_KEY) } // Tests for log buffer deduplication functionality func TestSQLEngine_GetLogBufferStartFromFile_BinaryFormat(t *testing.T) { engine := NewTestSQLEngine() // Create sample buffer start (binary format) bufferStartBytes := make([]byte, 8) binary.BigEndian.PutUint64(bufferStartBytes, uint64(1609459100000000001)) // Create file entry with buffer start + some chunks entry := &filer_pb.Entry{ Name: "test-log-file", Extended: map[string][]byte{ "buffer_start": bufferStartBytes, }, Chunks: []*filer_pb.FileChunk{ {FileId: "chunk1", Offset: 0, Size: 1000}, {FileId: "chunk2", Offset: 1000, Size: 1000}, {FileId: "chunk3", Offset: 2000, Size: 1000}, }, } // Test extraction result, err := engine.getLogBufferStartFromFile(entry) assert.NoError(t, err) assert.NotNil(t, result) assert.Equal(t, int64(1609459100000000001), result.StartIndex) // Test extraction works correctly with the binary format } func TestSQLEngine_GetLogBufferStartFromFile_NoMetadata(t *testing.T) { engine := NewTestSQLEngine() // Create file entry without buffer start entry := &filer_pb.Entry{ Name: "test-log-file", Extended: nil, } // Test extraction result, err := engine.getLogBufferStartFromFile(entry) assert.NoError(t, err) assert.Nil(t, result) } func TestSQLEngine_GetLogBufferStartFromFile_InvalidData(t *testing.T) { engine := NewTestSQLEngine() // Create file entry with invalid buffer start (wrong size) entry := &filer_pb.Entry{ Name: "test-log-file", Extended: map[string][]byte{ "buffer_start": []byte("invalid-binary"), }, } // Test extraction result, err := engine.getLogBufferStartFromFile(entry) assert.Error(t, err) assert.Contains(t, err.Error(), "invalid buffer_start format: expected 8 bytes") assert.Nil(t, result) } func TestSQLEngine_BuildLogBufferDeduplicationMap_NoBrokerClient(t *testing.T) { engine := NewTestSQLEngine() engine.catalog.brokerClient = nil // Simulate no broker client ctx := context.Background() result, err := engine.buildLogBufferDeduplicationMap(ctx, "/topics/test/test-topic") assert.NoError(t, err) assert.NotNil(t, result) assert.Empty(t, result) } func TestSQLEngine_LogBufferDeduplication_ServerRestartScenario(t *testing.T) { // Simulate scenario: Buffer indexes are now initialized with process start time // This tests that buffer start indexes are globally unique across server restarts // Before server restart: Process 1 buffer start (3 chunks) beforeRestartStart := LogBufferStart{ StartIndex: 1609459100000000000, // Process 1 start time } // After server restart: Process 2 buffer start (3 chunks) afterRestartStart := LogBufferStart{ StartIndex: 1609459300000000000, // Process 2 start time (DIFFERENT) } // Simulate 3 chunks for each file chunkCount := int64(3) // Calculate end indexes for range comparison beforeEnd := beforeRestartStart.StartIndex + chunkCount - 1 // [start, start+2] afterStart := afterRestartStart.StartIndex // [start, start+2] // Test range overlap detection (should NOT overlap) overlaps := beforeRestartStart.StartIndex <= (afterStart+chunkCount-1) && beforeEnd >= afterStart assert.False(t, overlaps, "Buffer ranges after restart should not overlap") // Verify the start indexes are globally unique assert.NotEqual(t, beforeRestartStart.StartIndex, afterRestartStart.StartIndex, "Start indexes should be different") assert.Less(t, beforeEnd, afterStart, "Ranges should be completely separate") // Expected values: // Before restart: [1609459100000000000, 1609459100000000002] // After restart: [1609459300000000000, 1609459300000000002] expectedBeforeEnd := int64(1609459100000000002) expectedAfterStart := int64(1609459300000000000) assert.Equal(t, expectedBeforeEnd, beforeEnd) assert.Equal(t, expectedAfterStart, afterStart) // This demonstrates that buffer start indexes initialized with process start time // prevent false positive duplicates across server restarts } func TestBrokerClient_BinaryBufferStartFormat(t *testing.T) { // Test scenario: getBufferStartFromEntry should only support binary format // This tests the standardized binary format for buffer_start metadata realBrokerClient := &BrokerClient{} // Test binary format (used by both log files and Parquet files) binaryEntry := &filer_pb.Entry{ Name: "2025-01-07-14-30-45", IsDirectory: false, Extended: map[string][]byte{ "buffer_start": func() []byte { // Binary format: 8-byte BigEndian buf := make([]byte, 8) binary.BigEndian.PutUint64(buf, uint64(2000001)) return buf }(), }, } bufferStart := realBrokerClient.getBufferStartFromEntry(binaryEntry) assert.NotNil(t, bufferStart) assert.Equal(t, int64(2000001), bufferStart.StartIndex, "Should parse binary buffer_start metadata") // Test Parquet file (same binary format) parquetEntry := &filer_pb.Entry{ Name: "2025-01-07-14-30.parquet", IsDirectory: false, Extended: map[string][]byte{ "buffer_start": func() []byte { buf := make([]byte, 8) binary.BigEndian.PutUint64(buf, uint64(1500001)) return buf }(), }, } bufferStart = realBrokerClient.getBufferStartFromEntry(parquetEntry) assert.NotNil(t, bufferStart) assert.Equal(t, int64(1500001), bufferStart.StartIndex, "Should parse binary buffer_start from Parquet file") // Test missing metadata emptyEntry := &filer_pb.Entry{ Name: "no-metadata", IsDirectory: false, Extended: nil, } bufferStart = realBrokerClient.getBufferStartFromEntry(emptyEntry) assert.Nil(t, bufferStart, "Should return nil for entry without buffer_start metadata") // Test invalid format (wrong size) invalidEntry := &filer_pb.Entry{ Name: "invalid-metadata", IsDirectory: false, Extended: map[string][]byte{ "buffer_start": []byte("invalid"), }, } bufferStart = realBrokerClient.getBufferStartFromEntry(invalidEntry) assert.Nil(t, bufferStart, "Should return nil for invalid buffer_start metadata") } // TestGetSQLValAlias tests the getSQLValAlias function, particularly for SQL injection prevention func TestGetSQLValAlias(t *testing.T) { engine := &SQLEngine{} tests := []struct { name string sqlVal *SQLVal expected string desc string }{ { name: "simple string", sqlVal: &SQLVal{ Type: StrVal, Val: []byte("hello"), }, expected: "'hello'", desc: "Simple string should be wrapped in single quotes", }, { name: "string with single quote", sqlVal: &SQLVal{ Type: StrVal, Val: []byte("don't"), }, expected: "'don''t'", desc: "String with single quote should have the quote escaped by doubling it", }, { name: "string with multiple single quotes", sqlVal: &SQLVal{ Type: StrVal, Val: []byte("'malicious'; DROP TABLE users; --"), }, expected: "'''malicious''; DROP TABLE users; --'", desc: "String with SQL injection attempt should have all single quotes properly escaped", }, { name: "empty string", sqlVal: &SQLVal{ Type: StrVal, Val: []byte(""), }, expected: "''", desc: "Empty string should result in empty quoted string", }, { name: "integer value", sqlVal: &SQLVal{ Type: IntVal, Val: []byte("123"), }, expected: "123", desc: "Integer value should not be quoted", }, { name: "float value", sqlVal: &SQLVal{ Type: FloatVal, Val: []byte("123.45"), }, expected: "123.45", desc: "Float value should not be quoted", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := engine.getSQLValAlias(tt.sqlVal) assert.Equal(t, tt.expected, result, tt.desc) }) } }