@@ -35,6 +35,12 @@ func withDebugMode(ctx context.Context) context.Context {
 	return context.WithValue(ctx, debugModeKey{}, true)
 }
 
+// LogBufferStart tracks the starting buffer index for a file.
+// Buffer indexes increase monotonically; a file covers len(chunks) consecutive indexes.
+type LogBufferStart struct {
+	StartIndex int64 `json:"start_index"` // starting buffer index (count = len(chunks))
+}
+
 // SQLEngine provides SQL query execution capabilities for SeaweedFS
 // Assumptions:
 // 1. MQ namespaces map directly to SQL databases
|
|
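For reference, the `LogBufferStart` struct added above models the JSON stored in a log file's `buffer_start` extended attribute (read back by `getLogBufferStartFromFile` later in this diff). Below is a minimal round-trip sketch; the writer side is hypothetical and only illustrates the format:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors the LogBufferStart type added in this diff.
type LogBufferStart struct {
	StartIndex int64 `json:"start_index"`
}

func main() {
	// Writer side (hypothetical): stamp the starting buffer index into
	// the file's extended attributes when a log file is flushed.
	raw, _ := json.Marshal(LogBufferStart{StartIndex: 42})
	fmt.Printf("entry.Extended[%q] = %s\n", "buffer_start", raw) // {"start_index":42}

	// Reader side: what getLogBufferStartFromFile does with the raw bytes.
	var start LogBufferStart
	if err := json.Unmarshal(raw, &start); err == nil {
		fmt.Println(start.StartIndex) // 42
	}
}
```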
@@ -1994,7 +2000,7 @@ func (e *SQLEngine) extractParquetSourceFiles(fileStats []*ParquetFileStats) map
 	return sourceFiles
 }
 
-// countLiveLogRowsExcludingParquetSources counts live log rows but excludes files that were converted to parquet
+// countLiveLogRowsExcludingParquetSources counts live log rows, excluding files that were converted to parquet and files that duplicate log buffer data
 func (e *SQLEngine) countLiveLogRowsExcludingParquetSources(ctx context.Context, partitionPath string, parquetSourceFiles map[string]bool) (int64, error) {
 	filerClient, err := e.catalog.brokerClient.GetFilerClient()
 	if err != nil {
|
@@ -2009,9 +2015,23 @@ func (e *SQLEngine) countLiveLogRowsExcludingParquetSources(ctx context.Context,
 		actualSourceFiles = parquetSourceFiles
 	}
 
+	// Second, get duplicate files from log buffer metadata
+	logBufferDuplicates, err := e.buildLogBufferDeduplicationMap(ctx, partitionPath)
+	if err != nil {
+		if isDebugMode(ctx) {
+			fmt.Printf("Warning: failed to build log buffer deduplication map: %v\n", err)
+		}
+		logBufferDuplicates = make(map[string]bool)
+	}
+
 	// Debug: Show deduplication status (only in explain mode)
-	if isDebugMode(ctx) && len(actualSourceFiles) > 0 {
-		fmt.Printf("Excluding %d converted log files from %s\n", len(actualSourceFiles), partitionPath)
+	if isDebugMode(ctx) {
+		if len(actualSourceFiles) > 0 {
+			fmt.Printf("Excluding %d converted log files from %s\n", len(actualSourceFiles), partitionPath)
+		}
+		if len(logBufferDuplicates) > 0 {
+			fmt.Printf("Excluding %d duplicate log buffer files from %s\n", len(logBufferDuplicates), partitionPath)
+		}
 	}
 
 	totalRows := int64(0)
|
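Note that all of the debug output above is gated on `isDebugMode(ctx)`, whose body is not part of this diff. A sketch of the assumed counterpart to `withDebugMode` from the first hunk, reading back the flag stored under `debugModeKey{}`:

```go
// Assumed counterpart to withDebugMode (its actual implementation lives
// outside this diff): report whether debug mode was enabled on this context.
func isDebugMode(ctx context.Context) bool {
	enabled, ok := ctx.Value(debugModeKey{}).(bool)
	return ok && enabled
}
```

An explain-mode code path would then call `ctx = withDebugMode(ctx)` before invoking the counting functions.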
@@ -2028,6 +2048,14 @@ func (e *SQLEngine) countLiveLogRowsExcludingParquetSources(ctx context.Context,
 			return nil
 		}
 
+		// Skip files that are duplicated due to log buffer metadata
+		if logBufferDuplicates[entry.Name] {
+			if isDebugMode(ctx) {
+				fmt.Printf("Skipping %s (duplicate log buffer data)\n", entry.Name)
+			}
+			return nil
+		}
+
 		// Count rows in live log file
 		rowCount, err := e.countRowsInLogFile(filerClient, partitionPath, entry)
 		if err != nil {
|
@@ -2070,6 +2098,96 @@ func (e *SQLEngine) getParquetSourceFilesFromMetadata(partitionPath string) (map
 	return sourceFiles, err
 }
 
+// getLogBufferStartFromFile reads the buffer start from a file's extended attributes
+func (e *SQLEngine) getLogBufferStartFromFile(entry *filer_pb.Entry) (*LogBufferStart, error) {
+	if entry.Extended == nil {
+		return nil, nil
+	}
+
+	// Only the buffer_start format is supported
+	if startJson, exists := entry.Extended["buffer_start"]; exists {
+		var bufferStart LogBufferStart
+		if err := json.Unmarshal(startJson, &bufferStart); err != nil {
+			return nil, fmt.Errorf("failed to parse buffer start: %v", err)
+		}
+		return &bufferStart, nil
+	}
+
+	return nil, nil
+}
+
+// buildLogBufferDeduplicationMap builds the set of files whose buffer ranges
+// overlap ranges already seen, so duplicated log buffer data is counted only once
+func (e *SQLEngine) buildLogBufferDeduplicationMap(ctx context.Context, partitionPath string) (map[string]bool, error) {
+	if e.catalog.brokerClient == nil {
+		return make(map[string]bool), nil
+	}
+
+	filerClient, err := e.catalog.brokerClient.GetFilerClient()
+	if err != nil {
+		return make(map[string]bool), nil // Don't fail the query, just skip deduplication
+	}
+
+	// Track buffer ranges instead of individual indexes: one range per file
+	// rather than one map entry per buffer index
+	type BufferRange struct {
+		start, end int64
+	}
+
+	processedRanges := make([]BufferRange, 0)
+	duplicateFiles := make(map[string]bool)
+
+	err = filer_pb.ReadDirAllEntries(ctx, filerClient, util.FullPath(partitionPath), "", func(entry *filer_pb.Entry, isLast bool) error {
+		if entry.IsDirectory || strings.HasSuffix(entry.Name, ".parquet") {
+			return nil // Skip directories and parquet files
+		}
+
+		// Get the buffer start for this file from its extended attributes
+		bufferStart, err := e.getLogBufferStartFromFile(entry)
+		if err != nil || bufferStart == nil {
+			return nil // No buffer info, can't deduplicate
+		}
+
+		// Calculate the range for this file: [start, start + chunkCount - 1]
+		chunkCount := int64(len(entry.GetChunks()))
+		if chunkCount == 0 {
+			return nil // Empty file, skip
+		}
+
+		fileRange := BufferRange{
+			start: bufferStart.StartIndex,
+			end:   bufferStart.StartIndex + chunkCount - 1,
+		}
+
+		// Check if this range overlaps with any processed range
+		isDuplicate := false
+		for _, processedRange := range processedRanges {
+			if fileRange.start <= processedRange.end && fileRange.end >= processedRange.start {
+				// Ranges overlap: this file contains duplicate buffer indexes
+				isDuplicate = true
+				if isDebugMode(ctx) {
+					fmt.Printf("Marking %s as duplicate (buffer range [%d-%d] overlaps with [%d-%d])\n",
+						entry.Name, fileRange.start, fileRange.end, processedRange.start, processedRange.end)
+				}
+				break
+			}
+		}
+
+		if isDuplicate {
+			duplicateFiles[entry.Name] = true
+		} else {
+			// Remember this range for subsequent overlap checks
+			processedRanges = append(processedRanges, fileRange)
+		}
+
+		return nil
+	})
+	if err != nil {
+		return make(map[string]bool), nil // Don't fail the query
+	}
+
+	return duplicateFiles, nil
+}
+
 // countRowsInLogFile counts rows in a single log file using SeaweedFS patterns
 func (e *SQLEngine) countRowsInLogFile(filerClient filer_pb.FilerClient, partitionPath string, entry *filer_pb.Entry) (int64, error) {
 	lookupFileIdFn := filer.LookupFn(filerClient)
|
|
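Taken together, the deduplication treats each live log file as the closed interval `[StartIndex, StartIndex+len(chunks)-1]` and marks a file as duplicate when its interval intersects any interval seen earlier. A self-contained illustration of that overlap predicate, with hypothetical values:

```go
package main

import "fmt"

// bufferRange mirrors the BufferRange type used in
// buildLogBufferDeduplicationMap: a closed interval of buffer indexes.
type bufferRange struct{ start, end int64 }

// overlaps is the same predicate the diff uses: two closed intervals
// intersect iff each one starts at or before the other's end.
func overlaps(a, b bufferRange) bool {
	return a.start <= b.end && a.end >= b.start
}

func main() {
	// File A: StartIndex 100 with 5 chunks covers [100, 104].
	a := bufferRange{start: 100, end: 104}

	// File B: StartIndex 103 with 4 chunks covers [103, 106]; it overlaps A,
	// so B would be marked as duplicate log buffer data and skipped.
	fmt.Println(overlaps(a, bufferRange{start: 103, end: 106})) // true

	// File C: StartIndex 105 with 2 chunks covers [105, 106]; it is disjoint
	// from A, so C would be counted normally.
	fmt.Println(overlaps(a, bufferRange{start: 105, end: 106})) // false
}
```

Each retained file contributes a single range, so memory stays proportional to the file count; the overlap scan is quadratic in the number of files in the worst case, which is acceptable at per-partition scale.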