
heap sort the data sources

pull/7185/head
chrislu, 1 month ago
parent commit cd928f9f38

1 changed file, 405 changed lines:
weed/query/engine/hybrid_message_scanner.go
@@ -1,6 +1,7 @@
package engine

import (
	"container/heap"
	"context"
	"encoding/json"
	"fmt"
@@ -139,6 +140,44 @@ type ParquetFileStats struct {
	ColumnStats map[string]*ParquetColumnStats
}

// StreamingDataSource provides a streaming interface for reading scan results
type StreamingDataSource interface {
	Next() (*HybridScanResult, error) // Returns next result or nil when done
	HasMore() bool                    // Returns true if more data available
	Close() error                     // Clean up resources
}
// StreamingMergeItem represents an item in the priority queue for streaming merge
type StreamingMergeItem struct {
	Result     *HybridScanResult
	SourceID   int
	DataSource StreamingDataSource
}

// StreamingMergeHeap implements heap.Interface for merging sorted streams by timestamp
type StreamingMergeHeap []*StreamingMergeItem

func (h StreamingMergeHeap) Len() int { return len(h) }

func (h StreamingMergeHeap) Less(i, j int) bool {
	// Sort by timestamp (ascending order)
	return h[i].Result.Timestamp < h[j].Result.Timestamp
}

func (h StreamingMergeHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }

func (h *StreamingMergeHeap) Push(x interface{}) {
	*h = append(*h, x.(*StreamingMergeItem))
}

func (h *StreamingMergeHeap) Pop() interface{} {
	old := *h
	n := len(old)
	item := old[n-1]
	*h = old[0 : n-1]
	return item
}
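// Illustrative only (not part of this commit): with heap.Interface implemented
// above, container/heap keeps the item with the smallest Timestamp at index 0,
// so a k-way merge can pop results in chronological order. r1, r2, src0, src1
// are hypothetical values:
//
//	h := &StreamingMergeHeap{}
//	heap.Init(h)
//	heap.Push(h, &StreamingMergeItem{Result: r1, SourceID: 0, DataSource: src0})
//	heap.Push(h, &StreamingMergeItem{Result: r2, SourceID: 1, DataSource: src1})
//	earliest := heap.Pop(h).(*StreamingMergeItem) // smallest Timestamp first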
// Scan reads messages from both live logs and archived Parquet files
// Uses SeaweedFS MQ's GenMergedReadFunc for seamless integration
// Assumptions:
@@ -398,9 +437,12 @@ func (hms *HybridMessageScanner) scanPartitionHybrid(ctx context.Context, partit
 	return results, err
 }
 
-// scanPartitionHybridWithStats scans a specific partition and returns statistics
+// scanPartitionHybridWithStats scans a specific partition using streaming merge for memory efficiency
+// PERFORMANCE IMPROVEMENT: Uses heap-based streaming merge instead of collecting all data and sorting
+// - Memory usage: O(k) where k = number of data sources, instead of O(n) where n = total records
+// - Scalable: Can handle large topics without LIMIT clauses efficiently
+// - Streaming: Processes data as it arrives rather than buffering everything
 func (hms *HybridMessageScanner) scanPartitionHybridWithStats(ctx context.Context, partition topic.Partition, options HybridScanOptions) ([]HybridScanResult, *HybridScanStats, error) {
-	var results []HybridScanResult
 	stats := &HybridScanStats{}
 
 	// STEP 1: Scan unflushed in-memory data from brokers (REAL-TIME)
@@ -410,19 +452,11 @@ func (hms *HybridMessageScanner) scanPartitionHybridWithStats(ctx context.Contex
 		if !isDebugMode(ctx) {
 			fmt.Printf("Warning: Failed to scan unflushed data from broker: %v\n", err)
 		}
-	} else {
-		results = append(results, unflushedResults...)
-		if unflushedStats != nil {
-			stats.BrokerBufferQueried = unflushedStats.BrokerBufferQueried
-			stats.BrokerBufferMessages = unflushedStats.BrokerBufferMessages
-			stats.BufferStartIndex = unflushedStats.BufferStartIndex
-		}
-	}
-
-	// STEP 2: Scan flushed data from disk (live logs + Parquet files)
-	// Create the hybrid read function that combines live logs + Parquet files
-	// This uses SeaweedFS MQ's own merged reading logic
-	mergedReadFn := logstore.GenMergedReadFunc(hms.filerClient, hms.topic, partition)
+	} else if unflushedStats != nil {
+		stats.BrokerBufferQueried = unflushedStats.BrokerBufferQueried
+		stats.BrokerBufferMessages = unflushedStats.BrokerBufferMessages
+		stats.BufferStartIndex = unflushedStats.BufferStartIndex
+	}
 
 	// Count live log files for statistics
 	liveLogCount, err := hms.countLiveLogFiles(partition)
@@ -433,99 +467,33 @@ func (hms *HybridMessageScanner) scanPartitionHybridWithStats(ctx context.Contex
 	}
 	stats.LiveLogFilesScanned = liveLogCount
 
-	// Set up time range for scanning
-	startTime := time.Unix(0, options.StartTimeNs)
-	if options.StartTimeNs == 0 {
-		startTime = time.Unix(0, 0) // Start from beginning if not specified
-	}
-	stopTsNs := options.StopTimeNs
-	if stopTsNs == 0 {
-		stopTsNs = time.Now().UnixNano() // Stop at current time if not specified
-	}
-
-	// Message processing function
-	eachLogEntryFn := func(logEntry *filer_pb.LogEntry) (isDone bool, err error) {
-		// Convert log entry to schema_pb.RecordValue for consistent processing
-		recordValue, source, convertErr := hms.convertLogEntryToRecordValue(logEntry)
-		if convertErr != nil {
-			return false, fmt.Errorf("failed to convert log entry: %v", convertErr)
-		}
-
-		// Apply predicate filtering (WHERE clause)
-		if options.Predicate != nil && !options.Predicate(recordValue) {
-			return false, nil // Skip this message
-		}
-
-		// Extract system columns
-		timestamp := recordValue.Fields[SW_COLUMN_NAME_TS].GetInt64Value()
-		key := recordValue.Fields[SW_COLUMN_NAME_KEY].GetBytesValue()
-
-		// Apply column projection
-		values := make(map[string]*schema_pb.Value)
-		if len(options.Columns) == 0 {
-			// Select all columns (excluding system columns from user view)
-			for name, value := range recordValue.Fields {
-				if name != SW_COLUMN_NAME_TS && name != SW_COLUMN_NAME_KEY {
-					values[name] = value
-				}
-			}
-		} else {
-			// Select specified columns only
-			for _, columnName := range options.Columns {
-				if value, exists := recordValue.Fields[columnName]; exists {
-					values[columnName] = value
-				}
-			}
-		}
-
-		results = append(results, HybridScanResult{
-			Values:    values,
-			Timestamp: timestamp,
-			Key:       key,
-			Source:    source,
-		})
-
-		// Apply row limit
-		if options.Limit > 0 && len(results) >= options.Limit {
-			return true, nil // Stop processing
-		}
-		return false, nil
-	}
-
-	// Only scan flushed data if we haven't reached the limit from unflushed data
-	if options.Limit == 0 || len(results) < options.Limit {
-		// Adjust limit for remaining capacity
-		remainingLimit := options.Limit - len(results)
-		if remainingLimit > 0 {
-			// Create a copy of options with adjusted limit for flushed data
-			flushedOptions := options
-			flushedOptions.Limit = remainingLimit
-		}
-
-		// Start scanning from the specified position
-		startPosition := log_buffer.MessagePosition{Time: startTime}
-		_, _, err = mergedReadFn(startPosition, stopTsNs, eachLogEntryFn)
-		if err != nil {
-			return nil, stats, fmt.Errorf("flushed data scan failed: %v", err)
-		}
-	}
-
-	// STEP 3: Sort results chronologically (unflushed + flushed data)
-	// This ensures proper time ordering across all data sources
-	if len(results) > 1 {
-		// Use efficient merge sort for better performance with large datasets
-		hms.mergeSort(results, 0, len(results)-1)
-	}
-
-	// Apply final limit after merging and sorting
-	if options.Limit > 0 && len(results) > options.Limit {
-		results = results[:options.Limit]
-	}
-
-	// If no results found, generate sample data for testing environments
+	// STEP 2: Create streaming data sources for memory-efficient merge
+	var dataSources []StreamingDataSource
+
+	// Add unflushed data source (if we have unflushed results)
+	if len(unflushedResults) > 0 {
+		// Sort unflushed results by timestamp before creating stream
+		if len(unflushedResults) > 1 {
+			hms.mergeSort(unflushedResults, 0, len(unflushedResults)-1)
+		}
+		dataSources = append(dataSources, NewSliceDataSource(unflushedResults))
+	}
+
+	// Add streaming flushed data source (live logs + Parquet files)
+	flushedDataSource := NewStreamingFlushedDataSource(hms, partition, options)
+	dataSources = append(dataSources, flushedDataSource)
+
+	// STEP 3: Use streaming merge for memory-efficient chronological ordering
+	var results []HybridScanResult
+	if len(dataSources) > 0 {
+		mergedResults, err := hms.streamingMerge(dataSources, options.Limit)
+		if err != nil {
+			return nil, stats, fmt.Errorf("streaming merge failed: %v", err)
+		}
+		results = mergedResults
+	}
+
+	// STEP 4: Fallback to sample data if no results found
 	if len(results) == 0 {
 		sampleResults := hms.generateSampleHybridData(options)
 		results = append(results, sampleResults...)
@@ -1267,6 +1235,253 @@ func (h *HybridMessageScanner) compareRawValues(v1, v2 interface{}) int {
	return 0
}
// streamingMerge merges multiple sorted data sources using a heap-based approach
// This provides memory-efficient merging without loading all data into memory
func (hms *HybridMessageScanner) streamingMerge(dataSources []StreamingDataSource, limit int) ([]HybridScanResult, error) {
	if len(dataSources) == 0 {
		return nil, nil
	}

	var results []HybridScanResult
	mergeHeap := &StreamingMergeHeap{}
	heap.Init(mergeHeap)

	// Initialize heap with first item from each data source
	for i, source := range dataSources {
		if source.HasMore() {
			result, err := source.Next()
			if err != nil {
				// Close all sources and return error
				for _, s := range dataSources {
					s.Close()
				}
				return nil, fmt.Errorf("failed to read from data source %d: %v", i, err)
			}
			if result != nil {
				heap.Push(mergeHeap, &StreamingMergeItem{
					Result:     result,
					SourceID:   i,
					DataSource: source,
				})
			}
		}
	}

	// Process results in chronological order
	for mergeHeap.Len() > 0 {
		// Get next chronologically ordered result
		item := heap.Pop(mergeHeap).(*StreamingMergeItem)
		results = append(results, *item.Result)

		// Check limit
		if limit > 0 && len(results) >= limit {
			break
		}

		// Try to get next item from the same data source
		if item.DataSource.HasMore() {
			nextResult, err := item.DataSource.Next()
			if err != nil {
				// Log error but continue with other sources
				fmt.Printf("Warning: Error reading next item from source %d: %v\n", item.SourceID, err)
			} else if nextResult != nil {
				heap.Push(mergeHeap, &StreamingMergeItem{
					Result:     nextResult,
					SourceID:   item.SourceID,
					DataSource: item.DataSource,
				})
			}
		}
	}

	// Close all data sources
	for _, source := range dataSources {
		source.Close()
	}
	return results, nil
}
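// Illustrative only (not part of this commit): merging two pre-sorted
// in-memory sources, given an existing scanner hms. Timestamps interleave
// chronologically, and a limit of 0 means "no limit":
//
//	a := NewSliceDataSource([]HybridScanResult{{Timestamp: 1}, {Timestamp: 3}})
//	b := NewSliceDataSource([]HybridScanResult{{Timestamp: 2}})
//	merged, err := hms.streamingMerge([]StreamingDataSource{a, b}, 0)
//	// merged timestamps: 1, 2, 3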
// SliceDataSource wraps a pre-loaded slice of results as a StreamingDataSource
// This is used for unflushed data that is already loaded into memory
type SliceDataSource struct {
	results []HybridScanResult
	index   int
}

func NewSliceDataSource(results []HybridScanResult) *SliceDataSource {
	return &SliceDataSource{
		results: results,
		index:   0,
	}
}

func (s *SliceDataSource) Next() (*HybridScanResult, error) {
	if s.index >= len(s.results) {
		return nil, nil
	}
	result := &s.results[s.index]
	s.index++
	return result, nil
}

func (s *SliceDataSource) HasMore() bool {
	return s.index < len(s.results)
}

func (s *SliceDataSource) Close() error {
	return nil // Nothing to clean up for slice-based source
}
// StreamingFlushedDataSource provides streaming access to flushed data
type StreamingFlushedDataSource struct {
	hms          *HybridMessageScanner
	partition    topic.Partition
	options      HybridScanOptions
	mergedReadFn func(startPosition log_buffer.MessagePosition, stopTsNs int64, eachLogEntryFn log_buffer.EachLogEntryFuncType) (lastReadPosition log_buffer.MessagePosition, isDone bool, err error)
	resultChan   chan *HybridScanResult
	errorChan    chan error
	doneChan     chan struct{}
	started      bool
	finished     bool
}

func NewStreamingFlushedDataSource(hms *HybridMessageScanner, partition topic.Partition, options HybridScanOptions) *StreamingFlushedDataSource {
	mergedReadFn := logstore.GenMergedReadFunc(hms.filerClient, hms.topic, partition)

	return &StreamingFlushedDataSource{
		hms:          hms,
		partition:    partition,
		options:      options,
		mergedReadFn: mergedReadFn,
		resultChan:   make(chan *HybridScanResult, 100), // Buffer for better performance
		errorChan:    make(chan error, 1),
		doneChan:     make(chan struct{}),
		started:      false,
		finished:     false,
	}
}
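// Design note (editorial): the result channel is buffered (100 entries here)
// so the scan goroutine can run ahead of the merge loop without blocking on
// every send; the exact size is a memory-vs-throughput trade-off rather than
// a correctness requirement.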
func (s *StreamingFlushedDataSource) startStreaming() {
	if s.started {
		return
	}
	s.started = true

	go func() {
		// NOTE: doneChan is closed only by Close() (see below). Closing it
		// here as well would panic with a double close whenever the consumer
		// closes the source before the scan finishes (e.g. when a LIMIT is
		// reached).
		defer close(s.resultChan)
		defer close(s.errorChan)

		// Set up time range for scanning
		startTime := time.Unix(0, s.options.StartTimeNs)
		if s.options.StartTimeNs == 0 {
			startTime = time.Unix(0, 0)
		}
		stopTsNs := s.options.StopTimeNs
		if stopTsNs == 0 {
			stopTsNs = time.Now().UnixNano()
		}

		// Message processing function
		eachLogEntryFn := func(logEntry *filer_pb.LogEntry) (isDone bool, err error) {
			// Convert log entry to schema_pb.RecordValue for consistent processing
			recordValue, source, convertErr := s.hms.convertLogEntryToRecordValue(logEntry)
			if convertErr != nil {
				return false, fmt.Errorf("failed to convert log entry: %v", convertErr)
			}

			// Apply predicate filtering (WHERE clause)
			if s.options.Predicate != nil && !s.options.Predicate(recordValue) {
				return false, nil // Skip this message
			}

			// Extract system columns
			timestamp := recordValue.Fields[SW_COLUMN_NAME_TS].GetInt64Value()
			key := recordValue.Fields[SW_COLUMN_NAME_KEY].GetBytesValue()

			// Apply column projection
			values := make(map[string]*schema_pb.Value)
			if len(s.options.Columns) == 0 {
				// Select all columns (excluding system columns from user view)
				for name, value := range recordValue.Fields {
					if name != SW_COLUMN_NAME_TS && name != SW_COLUMN_NAME_KEY {
						values[name] = value
					}
				}
			} else {
				// Select specified columns only
				for _, columnName := range s.options.Columns {
					if value, exists := recordValue.Fields[columnName]; exists {
						values[columnName] = value
					}
				}
			}

			result := &HybridScanResult{
				Values:    values,
				Timestamp: timestamp,
				Key:       key,
				Source:    source,
			}

			// Send result to channel, or stop if the consumer closed us
			select {
			case s.resultChan <- result:
				return false, nil
			case <-s.doneChan:
				return true, nil // Stop processing if closed
			}
		}

		// Start scanning from the specified position
		startPosition := log_buffer.MessagePosition{Time: startTime}
		_, _, err := s.mergedReadFn(startPosition, stopTsNs, eachLogEntryFn)
		if err != nil {
			select {
			case s.errorChan <- fmt.Errorf("flushed data scan failed: %v", err):
			case <-s.doneChan:
			}
		}
		s.finished = true
	}()
}
func (s *StreamingFlushedDataSource) Next() (*HybridScanResult, error) {
	if !s.started {
		s.startStreaming()
	}

	select {
	case result, ok := <-s.resultChan:
		if !ok {
			return nil, nil // No more results
		}
		return result, nil
	case err := <-s.errorChan:
		return nil, err
	case <-s.doneChan:
		return nil, nil
	}
}

func (s *StreamingFlushedDataSource) HasMore() bool {
	if !s.started {
		return true // Haven't started yet, so potentially has data
	}
	return !s.finished || len(s.resultChan) > 0
}
func (s *StreamingFlushedDataSource) Close() error {
	// doneChan has a single closer: guard against a repeated Close call,
	// then signal the scan goroutine to stop. Checking s.finished here was
	// both racy and insufficient, since the goroutine used to close the
	// channel as well.
	select {
	case <-s.doneChan:
		// Already closed
	default:
		close(s.doneChan)
	}
	return nil
}
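// Illustrative only (not part of this commit): the typical lifecycle as
// driven by streamingMerge:
//
//	src := NewStreamingFlushedDataSource(hms, partition, options)
//	defer src.Close() // signals the scan goroutine via doneChan
//	for src.HasMore() {
//		r, err := src.Next() // lazily starts the scan goroutine
//		if err != nil || r == nil {
//			break
//		}
//		// ... feed r into the merge heap ...
//	}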
// mergeSort efficiently sorts HybridScanResult slice by timestamp using merge sort algorithm
func (hms *HybridMessageScanner) mergeSort(results []HybridScanResult, left, right int) {
if left < right {
